# Thesis Test: aldi_none_none_none_daily

Discord Detector ALDI with following features:
- No use of GMM
- No use of selecting GMM components technique
- No PSU consideration

In [1]:
# import yout ALDI version here
from aldi_none_none_none_both import ALDI

aldi_id = '01'
aldi_name = 'aldi_none_none_none_daily'

# Set experiment's parameter
dict_param_exp = {'exp_id' : 0, 'p_value' : 0.001}
#dict_param_exp = {'exp_id' : 1, 'p_value' : 0.005}
# dict_param_exp = {'exp_id' : 2, 'p_value' : 0.01}
#dict_param_exp = {'exp_id' : 3, 'p_value' : 0.05}
#dict_param_exp = {'exp_id' : 4, 'p_value' : 0.1}

In [2]:
# LOAD LIBRARIES

import time
import logging

from datetime import datetime
from utils import *
from data_import_ashrae import DataImportAshrae
from aldi_evaluation_metrics import AldiEvaluationMetrics

In [10]:
# AUXILLIARY VARIABLES

myDataImport = DataImportAshrae()
myEvalMetrics = AldiEvaluationMetrics()

list_site_id = list(range(0,16)) 
list_site_id = [0] # DEBUG
list_site_name = [f'Site {i}' for i in list_site_id]
meter_type = 0

agg_method = 'majority'

curr_timestamp = datetime.today().strftime('%Y%m%d-%H%M')

dict_all_pred_labels = {}
dict_all_pred_labels_pred = {}
dict_all_true_labels_or = {}
dict_all_true_labels_and = {}
dict_all_true_labels_majo = {}
dict_all_true_labels_majoplus = {}
dict_all_roc_auc_or = {}
dict_all_roc_auc_and = {}
dict_all_roc_auc_majo = {}
dict_all_roc_auc_majoplus = {}

In [11]:
# PREPARE LOGGING TECHNIQUE

# create logger
logger = logging.getLogger(aldi_name)
logger.setLevel(logging.DEBUG)

# create console handler and set level to info
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# create file handler and set level to info
fh = logging.FileHandler(filename=f'10_thesis_results/{aldi_id}_{aldi_name}/{aldi_id}_{aldi_name}.log')
fh.setLevel(logging.INFO)

# create formatter
formatter = logging.Formatter('%(asctime)s, %(name)s, %(levelname)s: %(message)s',
                              datefmt='%m/%d/%Y %I:%M:%S %p')

# add formatter to ch & fh
ch.setFormatter(formatter)
fh.setFormatter(formatter)

# add ch to logger
logger.addHandler(ch)
logger.addHandler(fh)

logger.info(aldi_name + ' starts its experiments now')

10/27/2021 03:41:43 PM, aldi_none_none_none_daily, INFO: aldi_none_none_none_daily starts its experiments now
10/27/2021 03:41:43 PM, aldi_none_none_none_daily, INFO: aldi_none_none_none_daily starts its experiments now


In [12]:
# LOAD ENERGY CONSUMPTION DATA AND META DATA

df_metadata = myDataImport.get_meta_data()
df_timestamps = myDataImport.get_timestamps()

In [15]:
# RUN DISCORD DETECTOR ALDI

glb_start_time = time.time()
for site_id in list_site_id:
    logger.debug(f'Discord calculation: Run {list_site_name[site_id]}')
    
    # Select relevant energy consumption data + meta data
    df_site_meter = myDataImport.get_meter_data([meter_type], [site_id])
        
    # execute aldi
    aldi = ALDI(df_meters = df_site_meter, 
                df_metadata = df_metadata, 
                m = 24, 
                col_id = 'building_id', 
                site_id=site_id, 
                meter_id=meter_type,
                test_type='ks',
                use_iqr=False,
                iqr=0.25,
                verbose=False, 
                gpu=False,
                hourly_processing=False,
                aldi_name=aldi_name)
    
    # request predicted discord label from aldi
    df_pred_labels = aldi.get_result_df(p_value = dict_param_exp['p_value'],
                                       forecast_out = False)
    
    # request predict discord label for predictor
    df_pred_labels_pred = aldi.get_result_df(p_value = dict_param_exp['p_value'],
                                             forecast_out = True)
    
    # keep track of all sites' predicted discord labels
    dict_all_pred_labels[list_site_name[site_id]] = df_pred_labels
    dict_all_pred_labels_pred[list_site_name[site_id]] = df_pred_labels_pred

print(dict_all_pred_labels_pred)
# adjustment to ensure that the labels have the correct format for timekeeping
df_all_pred_labels = pd.concat(dict_all_pred_labels, axis=1)
df_all_pred_labels.columns = df_all_pred_labels.columns.get_level_values(1)
df_ref_timestamps = pd.DataFrame(df_timestamps.timestamp)
df_ref_timestamps['timestamp'] = df_ref_timestamps['timestamp'].dt.date
df_all_pred_labels['timestamp'] = df_all_pred_labels.index.date
df_all_pred_label_hourly = pd.merge(df_ref_timestamps, df_all_pred_labels, on='timestamp')
    
# calculate runtime and store the value in a log file
runtime_sec = time.time() - glb_start_time
logger.info(f'Runtime: {round(runtime_sec, 4)} sec')
logger.info(f'Runtime: {round(runtime_sec / 60 , 4) } min')
logger.info(f'{aldi_name} ends discord label caluclation')

10/27/2021 03:48:47 PM, aldi_none_none_none_daily, DEBUG: Discord calculation: Run Site 0
10/27/2021 03:48:47 PM, aldi_none_none_none_daily, DEBUG: Discord calculation: Run Site 0
10/27/2021 03:49:07 PM, aldi_none_none_none_daily, INFO: Runtime: 20.8104 sec
10/27/2021 03:49:07 PM, aldi_none_none_none_daily, INFO: Runtime: 20.8104 sec
10/27/2021 03:49:08 PM, aldi_none_none_none_daily, INFO: Runtime: 0.3468 min
10/27/2021 03:49:08 PM, aldi_none_none_none_daily, INFO: Runtime: 0.3468 min
10/27/2021 03:49:08 PM, aldi_none_none_none_daily, INFO: aldi_none_none_none_daily ends discord label caluclation
10/27/2021 03:49:08 PM, aldi_none_none_none_daily, INFO: aldi_none_none_none_daily ends discord label caluclation


{'Site 0':                  timestamp building_id  is_discord
0      2016-01-01 00:00:00           0           1
1      2016-01-01 01:00:00           0           1
2      2016-01-01 02:00:00           0           1
3      2016-01-01 03:00:00           0           1
4      2016-01-01 04:00:00           0           1
...                    ...         ...         ...
922315 2016-12-31 19:00:00          48           1
922316 2016-12-31 20:00:00          48           1
922317 2016-12-31 21:00:00          48           1
922318 2016-12-31 22:00:00          48           1
922319 2016-12-31 23:00:00          48           1

[922320 rows x 3 columns]}


In [7]:
# CALCULATE DIFFERENT METRICS

# 1. SITE WISE METRICS
for site_id in list_site_id:
    logger.debug(f'Evaluation: Run {list_site_name[site_id]}')
    
    # load predicted sites' discord label (daily)
    df_curr_pred_label_daily = dict_all_pred_labels[list_site_name[site_id]]
    
    # load true sites' discord label + aggregate
    df_curr_true_labels_hourly = myDataImport.get_label_data([meter_type], [site_id])
    
    df_curr_true_labels_or_daily = get_daily_resolution(df_hourly_data=df_curr_true_labels_hourly, 
                                                        agg_method='logic_or')
    df_curr_true_labels_and_daily = get_daily_resolution(df_hourly_data=df_curr_true_labels_hourly, 
                                                         agg_method='logic_and')
    df_curr_true_labels_majo_daily = get_daily_resolution(df_hourly_data=df_curr_true_labels_hourly, 
                                                          agg_method='majority')
    df_curr_true_labels_majoplus_daily = get_daily_resolution(df_hourly_data=df_curr_true_labels_hourly, 
                                                              agg_method='majority_plus')
    
    # keep track of all aggregate true labels
    dict_all_true_labels_or[list_site_name[site_id]] = df_curr_true_labels_or_daily
    dict_all_true_labels_and[list_site_name[site_id]] = df_curr_true_labels_and_daily
    dict_all_true_labels_majo[list_site_name[site_id]] = df_curr_true_labels_majo_daily
    dict_all_true_labels_majoplus[list_site_name[site_id]] = df_curr_true_labels_majoplus_daily
    
        
    
    # Calculate ROC AUC metric per site & aggregation method
    df_roc_auc_or = myEvalMetrics.get_roc_auc(df_true=df_curr_true_labels_or_daily,
                                              df_pred=df_curr_pred_label_daily)
    df_roc_auc_and = myEvalMetrics.get_roc_auc(df_true=df_curr_true_labels_and_daily,
                                               df_pred=df_curr_pred_label_daily)
    df_roc_auc_majo = myEvalMetrics.get_roc_auc(df_true=df_curr_true_labels_majo_daily,
                                                df_pred=df_curr_pred_label_daily)
    df_roc_auc_majoplus = myEvalMetrics.get_roc_auc(df_true=df_curr_true_labels_majoplus_daily,
                                                    df_pred=df_curr_pred_label_daily)
    
    # Keep track of all ROC AUC metric values
    dict_all_roc_auc_or[list_site_name[site_id]] = df_roc_auc_or
    dict_all_roc_auc_and[list_site_name[site_id]] = df_roc_auc_and
    dict_all_roc_auc_majo[list_site_name[site_id]] = df_roc_auc_majo
    dict_all_roc_auc_majoplus[list_site_name[site_id]] = df_roc_auc_majoplus
    
    # Prepare Confusion matrix/class report per site & aggregation method
    myEvalMetrics.get_class_report(df_true = df_curr_true_labels_or_daily,
                                   df_pred = df_curr_pred_label_daily, 
                                   aldi_impl = aldi_name, 
                                   level_name = list_site_name[site_id], 
                                   meter_type = meter_type, 
                                   path = f'10_thesis_results/{aldi_id}_{aldi_name}/or-exp_id{dict_param_exp["exp_id"]}')
    myEvalMetrics.get_class_report(df_true = df_curr_true_labels_and_daily,
                                   df_pred = df_curr_pred_label_daily, 
                                   aldi_impl = aldi_name, 
                                   level_name = list_site_name[site_id], 
                                   meter_type = meter_type, 
                                   path = f'10_thesis_results/{aldi_id}_{aldi_name}/and-exp_id{dict_param_exp["exp_id"]}')
    myEvalMetrics.get_class_report(df_true = df_curr_true_labels_majo_daily,
                                   df_pred = df_curr_pred_label_daily, 
                                   aldi_impl = aldi_name, 
                                   level_name = list_site_name[site_id], 
                                   meter_type = meter_type, 
                                   path = f'10_thesis_results/{aldi_id}_{aldi_name}/majo-exp_id{dict_param_exp["exp_id"]}')
    myEvalMetrics.get_class_report(df_true = df_curr_true_labels_majoplus_daily,
                                   df_pred = df_curr_pred_label_daily, 
                                   aldi_impl = aldi_name, 
                                   level_name = list_site_name[site_id], 
                                   meter_type = meter_type, 
                                   path = f'10_thesis_results/{aldi_id}_{aldi_name}/majoplus-exp_id{dict_param_exp["exp_id"]}')

10/27/2021 03:40:04 PM, aldi_none_none_none_daily, DEBUG: Evaluation: Run Site 0
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

In [8]:
# 2. DATASET WISE METRICS
logger.debug(f'Evaluation: dataset wide metrics/results')

# store all ROC AUC metric values
df_all_roc_auc_or = pd.DataFrame.from_dict(dict_all_roc_auc_or, orient='index', columns=['roc_auc'])
df_all_roc_auc_or.to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/or-exp_id{dict_param_exp["exp_id"]}/{curr_timestamp}-roc_auc.csv')

df_all_roc_auc_and = pd.DataFrame.from_dict(dict_all_roc_auc_and, orient='index', columns=["roc_auc"])
df_all_roc_auc_and.to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/and-exp_id{dict_param_exp["exp_id"]}/{curr_timestamp}-roc_auc.csv')

df_all_roc_auc_majo = pd.DataFrame.from_dict(dict_all_roc_auc_majo, orient='index', columns=['roc_auc'])
df_all_roc_auc_majo.to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/majo-exp_id{dict_param_exp["exp_id"]}/{curr_timestamp}-roc_auc.csv')

df_all_roc_auc_majoplus = pd.DataFrame.from_dict(dict_all_roc_auc_majoplus, orient='index', columns=['roc_auc'])
df_all_roc_auc_majoplus.to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/majoplus-exp_id{dict_param_exp["exp_id"]}/{curr_timestamp}-roc_auc.csv')


# create a single confusion matrix for each agg method
myEvalMetrics.get_class_report(df_true = pd.concat(dict_all_true_labels_or, axis=1), 
                               df_pred = pd.concat(dict_all_pred_labels, axis=1), 
                               aldi_impl = aldi_name, 
                               level_name = 'all', 
                               meter_type = meter_type,
                               path = f'10_thesis_results/{aldi_id}_{aldi_name}/or-exp_id{dict_param_exp["exp_id"]}')
myEvalMetrics.get_class_report(df_true = pd.concat(dict_all_true_labels_and, axis=1), 
                               df_pred = pd.concat(dict_all_pred_labels, axis=1), 
                               aldi_impl = aldi_name, 
                               level_name = 'all', 
                               meter_type = meter_type,
                               path = f'10_thesis_results/{aldi_id}_{aldi_name}/and-exp_id{dict_param_exp["exp_id"]}')
myEvalMetrics.get_class_report(df_true = pd.concat(dict_all_true_labels_majo, axis=1), 
                               df_pred = pd.concat(dict_all_pred_labels, axis=1), 
                               aldi_impl = aldi_name, 
                               level_name = 'all', 
                               meter_type = meter_type,
                               path = f'10_thesis_results/{aldi_id}_{aldi_name}/majo-exp_id{dict_param_exp["exp_id"]}')
myEvalMetrics.get_class_report(df_true = pd.concat(dict_all_true_labels_majoplus, axis=1), 
                               df_pred = pd.concat(dict_all_pred_labels, axis=1), 
                               aldi_impl = aldi_name, 
                               level_name = 'all', 
                               meter_type = meter_type,
                               path = f'10_thesis_results/{aldi_id}_{aldi_name}/majoplus-exp_id{dict_param_exp["exp_id"]}')

10/27/2021 03:40:12 PM, aldi_none_none_none_daily, DEBUG: Evaluation: dataset wide metrics/results
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00     23125\n           1       0.40      1.00      0.57     15305\n\n    accuracy                           0.40     38430\n   macro avg       0.20      0.50      0.28     38430\nweighted avg       0.16      0.40      0.23     38430\n'

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

In [9]:
# 3. BUILD EXPORTABLE FILE FOR PREDICTOR

df_export_labels = pd.concat(dict_all_pred_labels_pred, axis=0)
df_export_labels['meter'] = [0] * df_export_labels.shape[0]
df_export_labels.index = df_export_labels.index.get_level_values(1)


print(df_export_labels)

df_left_keys = myDataImport.get_timestamps_buildings(resolution='H')
df_exportable = pd.merge(df_left_keys, df_export_labels, how="left", on=["timestamp", "building_id", "meter"])

# Attention: NaNs are padded with 0
df_exportable = df_exportable.fillna(0)
df_exportable['is_discord'] = df_exportable['is_discord'].astype('int8')

print(df_exportable.shape)

# Export
df_exportable['is_discord'].to_csv(f'10_thesis_results/{aldi_id}_{aldi_name}/{curr_timestamp}-discords-exp_id{dict_param_exp["exp_id"]}.csv', index=False)  

                 timestamp building_id  is_discord  meter
0      2016-01-01 00:00:00           0           1      0
1      2016-01-01 01:00:00           0           1      0
2      2016-01-01 02:00:00           0           1      0
3      2016-01-01 03:00:00           0           1      0
4      2016-01-01 04:00:00           0           1      0
...                    ...         ...         ...    ...
922315 2016-12-31 19:00:00          48           1      0
922316 2016-12-31 20:00:00          48           1      0
922317 2016-12-31 21:00:00          48           1      0
922318 2016-12-31 22:00:00          48           1      0
922319 2016-12-31 23:00:00          48           1      0

[922320 rows x 4 columns]
(20216100, 4)
