# Thesis Test: aldi_gmm_dyn_none_daily

Discord detector ALDI with the following features:
- Use of a Gaussian mixture model (GMM)
- Dynamic selection of the left components
- No PSU consideration

In [1]:
# import your ALDI version here
from aldi_gmm_dyn_none_both import ALDI

# Identification of this ALDI variant; both values are used to build result paths
aldi_id = '07'
aldi_name = 'aldi_gmm_dyn_none_daily'

# Experiment grid: exp_id -> value of 'gmm_max_comp'.
# All experiments of this notebook use 'gmm_data' = 'D'.
dict_gmm_max_comp_by_exp_id = {
    0: 2,
    1: 3,
    2: 4,
    3: 5,
    4: 6,
    5: 7,
    6: 8,
    7: 9,
    8: 10,
    9: 15,
    10: 30,
}

# Set experiment's parameter: change ONLY this id to switch to another experiment
selected_exp_id = 5

# Fail early on an unknown id instead of running a whole experiment with a bad setup
if selected_exp_id not in dict_gmm_max_comp_by_exp_id:
    raise ValueError(f'Unknown exp_id {selected_exp_id}; '
                     f'valid ids: {sorted(dict_gmm_max_comp_by_exp_id)}')

dict_param_exp = {'exp_id': selected_exp_id,
                  'gmm_max_comp': dict_gmm_max_comp_by_exp_id[selected_exp_id],
                  'gmm_data': 'D'}

In [2]:
# LOAD LIBRARIES

import logging
import os
import time

from datetime import datetime

import pandas as pd

from utils import *
from data_import_ashrae import DataImportAshrae
from aldi_evaluation_metrics import AldiEvaluationMetrics

In [3]:
# AUXILIARY VARIABLES

# helper objects for data access and for the evaluation metrics
myDataImport = DataImportAshrae()
myEvalMetrics = AldiEvaluationMetrics()

# ids of the sites to process and the matching display names ('Site 0' ... 'Site 15')
list_site_id = list(range(16))
list_site_name = [f'Site {site}' for site in list_site_id]
meter_type = 0

agg_method = 'majority'

# timestamp of this run (minute resolution), used as prefix of the exported files
curr_timestamp = datetime.now().strftime('%Y%m%d-%H%M')

# per-site stores, keyed by site name: predicted labels (evaluation / predictor export)
dict_all_pred_labels, dict_all_pred_labels_pred = {}, {}

# per-site stores: daily true labels, one dict per aggregation method
(dict_all_true_labels_or,
 dict_all_true_labels_and,
 dict_all_true_labels_majo,
 dict_all_true_labels_majoplus) = ({} for _ in range(4))

# per-site stores: ROC AUC values, one dict per aggregation method
(dict_all_roc_auc_or,
 dict_all_roc_auc_and,
 dict_all_roc_auc_majo,
 dict_all_roc_auc_majoplus) = ({} for _ in range(4))

In [4]:
# PREPARE LOGGING TECHNIQUE

# make sure the result folder exists, otherwise the file handler cannot open its log file
log_dir = f'10_thesis_results/{aldi_id}_{aldi_name}'
os.makedirs(log_dir, exist_ok=True)

# create logger
logger = logging.getLogger(aldi_name)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object for the same name, so re-running
# this cell would stack further handlers and emit every message multiple times.
# Detach (and close) the handlers of a previous run before adding fresh ones.
for old_handler in list(logger.handlers):
    logger.removeHandler(old_handler)
    old_handler.close()

# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create file handler and set level to info (debug messages go to the console only)
fh = logging.FileHandler(filename=f'{log_dir}/{aldi_id}_{aldi_name}.log')
fh.setLevel(logging.INFO)

# create formatter
formatter = logging.Formatter('%(asctime)s, %(name)s, %(levelname)s: %(message)s',
                              datefmt='%m/%d/%Y %I:%M:%S %p')

# add formatter to ch & fh
ch.setFormatter(formatter)
fh.setFormatter(formatter)

# add ch & fh to logger
logger.addHandler(ch)
logger.addHandler(fh)

logger.info(aldi_name + ' starts its experiments now')

10/20/2021 10:56:51 AM, aldi_gmm_dyn_none_daily, INFO: aldi_gmm_dyn_none_daily starts its experiments now


In [5]:
# LOAD ENERGY CONSUMPTION DATA AND META DATA

# fetch the meta data first, then the reference timestamps (same order as before)
df_metadata, df_timestamps = (
    myDataImport.get_meta_data(),
    myDataImport.get_timestamps(),
)

In [6]:
# RUN DISCORD DETECTOR ALDI

glb_start_time = time.time()

# Iterate ids and names together. Looking the name up via list_site_name[site_id]
# treats the id as a list position, which is only correct while list_site_id is
# exactly 0..n-1 and breaks as soon as a subset of sites is selected.
for site_id, site_name in zip(list_site_id, list_site_name):
    logger.debug(f'Discord calculation: Run {site_name}')

    # Select relevant energy consumption data + meta data
    df_site_meter = myDataImport.get_meter_data([meter_type], [site_id])

    # execute aldi
    aldi = ALDI(df_meters = df_site_meter,
                df_metadata = df_metadata,
                m = 24,
                col_id = 'building_id',
                site_id=site_id,
                meter_id=meter_type,
                verbose=False,
                gpu=False,
                hourly_processing=False,
                aldi_name=aldi_name)
    aldi.set_gmm_model(gmm_data = dict_param_exp['gmm_data'],
                       gmm_max_comp = dict_param_exp['gmm_max_comp'])

    # request predicted discord label from aldi
    df_pred_labels = aldi.get_result_df(forecast_out = False)

    # request predict discord label for predictor
    df_pred_labels_pred = aldi.get_result_df(forecast_out = True)

    # keep track of all sites' predicted discord labels
    dict_all_pred_labels[site_name] = df_pred_labels
    dict_all_pred_labels_pred[site_name] = df_pred_labels_pred

# adjustment to ensure that the labels have the correct format for timekeeping
# NOTE(review): this step runs before the runtime is taken, so it is part of the
# logged runtime - presumably on purpose, confirm before moving it.
df_all_pred_labels = pd.concat(dict_all_pred_labels, axis=1)
# concat with a dict adds the site names as outer column level; keep the inner level only
df_all_pred_labels.columns = df_all_pred_labels.columns.get_level_values(1)
df_ref_timestamps = pd.DataFrame(df_timestamps.timestamp)
# reduce both sides to plain dates so that the merge below can match them
df_ref_timestamps['timestamp'] = df_ref_timestamps['timestamp'].dt.date
df_all_pred_labels['timestamp'] = df_all_pred_labels.index.date
df_all_pred_label_hourly = pd.merge(df_ref_timestamps, df_all_pred_labels, on='timestamp')

# calculate runtime and store the value in a log file
runtime_sec = time.time() - glb_start_time
logger.info(f'Runtime: {round(runtime_sec, 4)} sec')
logger.info(f'Runtime: {round(runtime_sec / 60 , 4) } min')
logger.info(f'{aldi_name} ends discord label caluclation')

10/20/2021 10:57:10 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 0
10/20/2021 10:57:30 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 1
10/20/2021 10:57:40 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 2
10/20/2021 10:58:05 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 3
10/20/2021 10:58:29 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 4
10/20/2021 10:58:46 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 5
10/20/2021 10:59:03 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 6
10/20/2021 10:59:10 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 7
10/20/2021 10:59:14 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 8
10/20/2021 10:59:27 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 9
10/20/2021 10:59:49 AM, aldi_gmm_dyn_none_daily, DEBUG: Discord calculation: Run Site 10
10/20/2021 10:59:56 AM, aldi_gm

In [7]:
# CALCULATE DIFFERENT METRICS

# 1. SITE WISE METRICS

# One entry per aggregation method that turns the hourly true labels into daily ones:
# (result folder tag, agg_method name, store for true labels, store for ROC AUC values)
list_agg_setup = [
    ('or', 'logic_or', dict_all_true_labels_or, dict_all_roc_auc_or),
    ('and', 'logic_and', dict_all_true_labels_and, dict_all_roc_auc_and),
    ('majo', 'majority', dict_all_true_labels_majo, dict_all_roc_auc_majo),
    ('majoplus', 'majority_plus', dict_all_true_labels_majoplus, dict_all_roc_auc_majoplus),
]

# Iterate ids and names together: list_site_name[site_id] would use the id as a list
# position, which is only correct while list_site_id is exactly 0..n-1.
for site_id, site_name in zip(list_site_id, list_site_name):
    logger.debug(f'Evaluation: Run {site_name}')

    # load predicted sites' discord label (daily)
    df_curr_pred_label_daily = dict_all_pred_labels[site_name]

    # load true sites' discord label (hourly)
    df_curr_true_labels_hourly = myDataImport.get_label_data([meter_type], [site_id])

    # aggregate the true labels with every method + keep track of all aggregates
    dict_curr_true_labels_daily = {}
    for agg_tag, agg_name, dict_true_store, _ in list_agg_setup:
        df_curr_true_labels_daily = get_daily_resolution(df_hourly_data=df_curr_true_labels_hourly,
                                                         agg_method=agg_name)
        dict_curr_true_labels_daily[agg_tag] = df_curr_true_labels_daily
        dict_true_store[site_name] = df_curr_true_labels_daily

    # Calculate ROC AUC metric per site & aggregation method + keep track of all values
    for agg_tag, _, _, dict_roc_store in list_agg_setup:
        dict_roc_store[site_name] = myEvalMetrics.get_roc_auc(df_true=dict_curr_true_labels_daily[agg_tag],
                                                              df_pred=df_curr_pred_label_daily)

    # Prepare Confusion matrix/class report per site & aggregation method
    for agg_tag, _, _, _ in list_agg_setup:
        myEvalMetrics.get_class_report(df_true = dict_curr_true_labels_daily[agg_tag],
                                       df_pred = df_curr_pred_label_daily,
                                       aldi_impl = aldi_name,
                                       level_name = site_name,
                                       meter_type = meter_type,
                                       path = f'10_thesis_results/{aldi_id}_{aldi_name}/{agg_tag}-exp_id{dict_param_exp["exp_id"]}')

10/20/2021 11:01:11 AM, aldi_gmm_dyn_none_daily, DEBUG: Evaluation: Run Site 0
10/20/2021 11:01:15 AM, aldi_gmm_dyn_none_daily, DEBUG: Evaluation: Run Site 1
10/20/2021 11:01:18 AM, aldi_gmm_dyn_none_daily, DEBUG: Evaluation: Run Site 2
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
10/20/2021 11:01:22 AM, aldi_gmm_dyn_none_daily, DEBUG: Evaluation: Run Site 3
  _warn_prf(average, mod

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

In [8]:
# 2. DATASET WISE METRICS
logger.debug(f'Evaluation: dataset wide metrics/results')


def export_all_roc_auc(dict_roc_auc, agg_tag):
    """Store the per-site ROC AUC values of one aggregation method as CSV file.

    dict_roc_auc: site name -> ROC AUC value(s) collected in the site wise evaluation.
    agg_tag: folder prefix of the aggregation method ('or', 'and', 'majo', 'majoplus').
    Returns the data frame that was written (index: site names, column: 'roc_auc').
    """
    df_roc_auc = pd.DataFrame.from_dict(dict_roc_auc, orient='index', columns=['roc_auc'])
    df_roc_auc.to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/{agg_tag}-exp_id{dict_param_exp["exp_id"]}/{curr_timestamp}-roc_auc.csv')
    return df_roc_auc


def report_all_sites(dict_true_labels, agg_tag):
    """Create one class report/confusion matrix over all sites for one aggregation method.

    dict_true_labels: site name -> daily true labels of that aggregation method.
    agg_tag: folder prefix of the aggregation method ('or', 'and', 'majo', 'majoplus').
    Returns whatever myEvalMetrics.get_class_report returns.
    """
    return myEvalMetrics.get_class_report(df_true = pd.concat(dict_true_labels, axis=1),
                                          df_pred = df_pred_labels_all_sites,
                                          aldi_impl = aldi_name,
                                          level_name = 'all',
                                          meter_type = meter_type,
                                          path = f'10_thesis_results/{aldi_id}_{aldi_name}/{agg_tag}-exp_id{dict_param_exp["exp_id"]}')


# store all ROC AUC metric values
df_all_roc_auc_or = export_all_roc_auc(dict_all_roc_auc_or, 'or')
df_all_roc_auc_and = export_all_roc_auc(dict_all_roc_auc_and, 'and')
df_all_roc_auc_majo = export_all_roc_auc(dict_all_roc_auc_majo, 'majo')
df_all_roc_auc_majoplus = export_all_roc_auc(dict_all_roc_auc_majoplus, 'majoplus')

# The predicted labels are the same for every aggregation method: build the
# dataset wide frame once instead of once per report.
df_pred_labels_all_sites = pd.concat(dict_all_pred_labels, axis=1)

# create a single confusion matrix for each agg method
report_all_sites(dict_all_true_labels_or, 'or')
report_all_sites(dict_all_true_labels_and, 'and')
report_all_sites(dict_all_true_labels_majo, 'majo')
# kept as last expression of the cell so that its return value is still displayed
report_all_sites(dict_all_true_labels_majoplus, 'majoplus')

10/20/2021 11:02:08 AM, aldi_gmm_dyn_none_daily, DEBUG: Evaluation: dataset wide metrics/results


'              precision    recall  f1-score   support\n\n           0       0.94      0.31      0.46    469166\n           1       0.11      0.82      0.19     47992\n\n    accuracy                           0.36    517158\n   macro avg       0.53      0.57      0.33    517158\nweighted avg       0.87      0.36      0.44    517158\n'

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

In [9]:
# 3. BUILD EXPORTABLE FILE FOR PREDICTOR

# stack the predictor labels of all sites below each other
df_export_labels = pd.concat(dict_all_pred_labels_pred, axis=0)
# Tag every row with the meter that was actually processed. This was hard-coded to 0
# before, which silently produces wrong merge keys once meter_type is changed.
# A scalar is broadcast by pandas, no per-row Python list is needed.
df_export_labels['meter'] = meter_type
# concat with a dict adds the site names as outer index level; keep the inner level only
# (presumably the timestamps used as merge key below - TODO confirm)
df_export_labels.index = df_export_labels.index.get_level_values(1)

# left merge: keep every key row of the reference frame, labelled or not
df_left_keys = myDataImport.get_timestamps_buildings(resolution='H')
df_exportable = pd.merge(df_left_keys, df_export_labels, how="left", on=["timestamp", "building_id", "meter"])

# Attention: NaNs are padded with 0
df_exportable = df_exportable.fillna(0)
df_exportable['is_discord'] = df_exportable['is_discord'].astype('int8')

print(df_exportable.shape)

# Export (label column only, without index: the row order of df_left_keys is the key)
df_exportable['is_discord'].to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/{curr_timestamp}-discords-exp_id{dict_param_exp["exp_id"]}.csv', index=False) 

(20216100, 4)
