In [35]:
import nilmtk as nilmtk
from nilmtk import DataSet, MeterGroup, Appliance
from nilmtk.metergroup import MeterGroupID
from nilmtk.elecmeter import ElecMeter, ElecMeterID 
from typing import List, Tuple, Dict
import pandas as pd
from pandas import DataFrame
from fuzzywuzzy import fuzz
import numpy as np
import math
import time

# Earlier path-based location of the REDD file, kept for reference only (not executed):
#REDD = os.path.join(dirname, r"datasource\dataset\REDD\redd.h5")
# Open the dataset file 'redd.h5', resolved relative to the current working directory.
redd = DataSet('redd.h5')
# Label text used by the functions below to recognise the site (mains) meter column.
SITE_METER = 'Site meter'

### House 1 

In [36]:
# --- Date windows (formatted as month-day-year strings) -------------------
year = '2011'

# Training window: 18 April -> 17 May
train_month_start = '4'
train_month_end = '5'
train_start_date = f"{train_month_start}-18-{year}"
train_end_date = f"{train_month_end}-17-{year}"

# Test window: 18 May -> 25 May
test_month_start = '5'
test_month_end = '5'
test_start_date = f"{test_month_start}-18-{year}"
test_end_date = f"{test_month_end}-25-{year}"

# --- Appliance selections -------------------------------------------------
# Appliances used for the comparison with [1]
appliances_redd1_taba = [
    'electric oven', 'fridge', 'light', 'microwave', 'washer dryer', 'unknown', 'sockets',
]
# Appliances used for the comparison with [2] (same as above minus 'washer dryer')
appliances_redd1_nalm = [
    'electric oven', 'fridge', 'light', 'microwave', 'unknown', 'sockets',
]

# Building (house) number the experiments below run on
building = 1

In [37]:
def timing(t: str):
    """Print `t` prefixed with 'TIMING: ' and follow it with one empty line."""
    # A single write ending in two newlines produces the message line plus a blank line.
    print('TIMING: ' + t, end='\n\n')

    
def info(i: str):
    """Print the message `i` on its own line, prefixed with 'INFO: '."""
    message = 'INFO: ' + i
    print(message)
    
def debug(d):
    """
    Print `d` on its own line, prefixed with 'DEBUG: '.
    Args:
        d: The value to report. Strings are printed unchanged; any other object
            is converted with str() first, so passing a number, shape tuple or
            array no longer raises a TypeError on concatenation.
    """
    print('DEBUG: ' + str(d))

## Read data using nilmtk
* Read selected data
* Label data
* Bucketize data (delay embedding)


In [38]:
def get_selected_metergroup(dataset: DataSet, building: int, appliances: List,  start: str, end: str, 
                               sample_period = 3, include_mains=True):
    """
    Build the MeterGroup holding the meters of the selected appliances.

    Each requested appliance type is looked up on its own. A type whose selection
    holds a single top-level meter is taken as is; a type with several meters is
    narrowed down to one appliance instance (instance 1, except 'sockets' in
    building 3, which uses instance 4).

    Args:
        dataset (DataSet): The dataset to read from. Its time window is set to
            [start, end] as a side effect.
        building (int): Key of the building inside ``dataset.buildings``.
        appliances (List): Appliance types to select.
        start (str): Start of the time window passed to ``dataset.set_window``.
        end (str): End of the time window passed to ``dataset.set_window``.
        sample_period: Unused here; kept so existing callers keep working.
        include_mains (bool): When True, the mains meter is added to the result
            (only the first one if the building reports several).
    Returns:
        The MeterGroup of the selected meters.
    """
    start_time = time.time()
    info('get_selected_metergroup() starts')
    dataset.set_window(start=start, end=end)
    elec = dataset.buildings[building].elec

    appliances_with_one_meter = []
    appliances_with_more_meters = []
    for appliance in appliances:
        # Classify each appliance by ITS OWN meters. Selecting with the whole
        # `appliances` list here would give every appliance the same verdict.
        metergroup = elec.select_using_appliances(type=appliance)
        if len(metergroup.meters) > 1:
            appliances_with_more_meters.append(appliance)
        else:
            appliances_with_one_meter.append(appliance)

    special_metergroup = None
    for appliance in appliances_with_more_meters:
        inst = 4 if (appliance == 'sockets' and building == 3) else 1
        # Use the chosen instance on every iteration, not only on the first one.
        instance_group = elec.select_using_appliances(type=appliance, instance=inst)
        if special_metergroup is None:
            special_metergroup = instance_group
        else:
            special_metergroup = special_metergroup.union(instance_group)

    selected_metergroup = elec.select_using_appliances(type=appliances_with_one_meter)
    # Nothing to merge when no appliance had more than one meter.
    if special_metergroup is not None:
        selected_metergroup = selected_metergroup.union(special_metergroup)

    if include_mains:
        mains_meter = dataset.buildings[building].elec.mains()
        if isinstance(mains_meter, MeterGroup):
            if len(mains_meter.meters) > 1:
                # Several mains meters: keep only the first one.
                mains_metergroup = MeterGroup(meters=[mains_meter.meters[0]])
            else:
                mains_metergroup = mains_meter
        else:
            # A single meter object: wrap it so it can be merged with union().
            mains_metergroup = MeterGroup(meters=[mains_meter])
        selected_metergroup = selected_metergroup.union(mains_metergroup)

    info('get_selected_metergroup() finished')
    print(selected_metergroup)
    timing('{}'.format(round(time.time() - start_time, 2)))
    return selected_metergroup

In [39]:
def read_selected_appliances(dataset: DataSet, building: int, appliances : List, start: str, end: str, include_mains,
                               sample_period = 3):
    """
    Load the selected appliances of a building into a DataFrame.

    The meters are chosen with get_selected_metergroup() and read at the given
    sample period; missing values are replaced by 0.

    Returns:
        A tuple (df, selected_metergroup): the DataFrame produced by
        ``dataframe_of_meters`` for the selected meters, and the MeterGroup
        it was read from.
    """
    started_at = time.time()
    info('read_selected_appliances() starts')

    selected_metergroup = get_selected_metergroup(
        dataset, building, appliances, start, end,
        sample_period=sample_period, include_mains=include_mains,
    )
    # Gaps in the readings are filled with zero.
    df = selected_metergroup.dataframe_of_meters(sample_period=sample_period).fillna(0)

    info('read_selected_appliances() finished')
    elapsed = round(time.time() - started_at, 2)
    timing('{}'.format(elapsed))
    return df, selected_metergroup

In [40]:
def all_meters(dataset: DataSet, building: int, start_date: str, end_date:str, sample_period = 3):
    """
    Print the electricity MeterGroup of the given building.

    Only `dataset` and `building` are used; `start_date`, `end_date` and
    `sample_period` are accepted but currently ignored.
    """
    print(dataset.buildings[building].elec)

In [41]:
def get_labels_df(df: DataFrame, selected_metergroup: MeterGroup) -> [List, Dict]:
    """
    Describe the columns of `df` with human-readable labels.

    Returns:
        A pair (labels, thresholds): `labels` is a list with one label per
        column of `df`, in column order; `thresholds` is a dict mapping each
        label to the on-power threshold reported by its meter or meter group.
    """
    started_at = time.time()
    info('get_labels_df() starts')

    column_labels = []
    on_thresholds = {}
    for column_id in df.columns:
        if not isinstance(column_id, MeterGroupID):
            # Plain column id: look the single meter up directly.
            meter = selected_metergroup[column_id]
            name = meter.label()
            on_thresholds[name] = meter.on_power_threshold()
        else:
            # Grouped column id: its first field holds the ids of the member meters.
            member_ids = list(column_id[0])
            nested_group = selected_metergroup[member_ids]
            name = nested_group.get_labels(member_ids)[0]
            on_thresholds[name] = nested_group.on_power_threshold()
        column_labels.append(name)

    info('get_labels_df() finished')
    print(column_labels)
    timing('{}'.format(round(time.time() - started_at, 2)))

    return column_labels, on_thresholds

In [42]:
def get_dic_real_power(df: DataFrame, labels: List) -> Dict:
    """
    Pair every label with the column of `df` at the same position.

    Returns:
        A dict whose keys are the given labels and whose values are the
        corresponding columns of `df` (label i -> i-th column).
    """
    started_at = time.time()
    info('get_dic_real_power() starts')

    # Columns in DataFrame order, without their ids.
    columns = [series for _, series in df.items()]

    power_by_label = {}
    for position, label in enumerate(labels):
        power_by_label[label] = columns[position]

    info('get_dic_real_power() finished')
    print(power_by_label)
    timing('{}'.format(round(time.time() - started_at, 2)))
    return power_by_label

In [43]:
def get_dic_labeled_power(RealPower: Dict, threshold: Dict) -> Dict:
    """
    Turn the power readings of every appliance into on/off labels.

    Returns:
        A dict whose keys are the appliance names and whose values are the
        lists produced by create_labels() for that appliance's readings and
        threshold. The 'Site meter' entry is left out.
    """
    started_at = time.time()
    info('get_dic_labeled_power() starts')

    LabeledPower = {
        appliance: create_labels(real_power, threshold[appliance])
        for appliance, real_power in RealPower.items()
        if appliance != 'Site meter'
    }

    info('get_dic_labeled_power() finished')
    print(LabeledPower)
    timing('{}'.format(round(time.time() - started_at, 2)))
    return LabeledPower

In [44]:
def create_labels(array, threshold):
    """
    Convert power readings into binary on/off labels.

    Args:
        array: Sequence of readings (ndarray, pandas Series, list, ...). It is
            converted with np.asarray, so elements are always taken by position
            and a Series index (e.g. timestamps) is never used for look-ups.
        threshold: A reading counts as "on" when it is >= this value.
    Returns:
        A list of floats with one entry per reading: 1.0 where the reading is
        at or above the threshold, 0.0 otherwise.
    """
    values = np.asarray(array)
    # One vectorised comparison replaces the element-by-element Python loop.
    return list((values >= threshold).astype(float))

In [45]:


def normalize_columns(df: DataFrame, meter_group: MeterGroup, appliance_names: List[str]) -> Tuple[DataFrame, dict]:
    """
    Rename the columns of `df` to the matching entries of `appliance_names`.

    A meter label matches an appliance name when the first words of both
    (lower-cased, with 'electric' removed) have a fuzzy ratio above 90. The
    site meter label is kept unchanged unless it is itself listed in
    `appliance_names`.
    Args:
        df (DataFrame): Frame whose columns are meter ids; renamed in place.
        meter_group (MeterGroup): Group used to resolve the ids to labels.
        appliance_names (List[str]): The names to normalize to.
    Returns:
        A tuple with the renamed DataFrame and a dictionary mapping each
        normalized label to the original column id.
    Raises:
        LabelNormalizationError: If the number of normalized labels differs
            from the number of meter labels.
    """
    def _first_word(text: str) -> str:
        # Lower-case, drop 'electric', trim, and keep the first remaining word.
        return text.lower().replace('electric', "").strip().split()[0]

    labels = meter_group.get_labels(df.columns)
    info(f"Df columns before normalization {df.columns}")
    info(f"Labels before normalization {labels}")

    normalized_labels = []
    for label in labels:
        keep_site_meter = label == SITE_METER and SITE_METER not in appliance_names
        if keep_site_meter:
            normalized_labels.append(SITE_METER)
        else:
            for name in appliance_names:
                ratio = fuzz.ratio(_first_word(label), _first_word(name))
                if ratio <= 90:
                    continue
                info(f"{name} ~ {label} ({ratio}%)")
                normalized_labels.append(name)

    if len(normalized_labels) != len(labels):
        debug(f"len(normalized_labels) {len(normalized_labels)} != len(labels) {len(labels)}")
        raise LabelNormalizationError()

    label2id = dict(zip(normalized_labels, df.columns))
    df.columns = normalized_labels
    info(f"Normalized labels {normalized_labels}")
    return df, label2id

In [46]:
 def get_site_meter_data(df: DataFrame) -> np.ndarray:
    """
    Extract the readings of the site meter.
    Args:
        df (DataFrame): Frame whose column names identify the meters.
    Returns:
        The values (ndarray) of the first column whose name contains SITE_METER.
    Raises:
        NoSiteMeterException: If no column name contains SITE_METER.
    """
    not_found = object()
    # Lazily scan the columns and stop at the first match.
    site_col = next((col for col in df.columns if SITE_METER in col), not_found)
    if site_col is not_found:
        raise NoSiteMeterException("Couldn' t find site meter.")
    return df[site_col].values

In [47]:
def get_multilabels(labels_df: DataFrame, appliances: List = None) -> DataFrame:
    """
    Pick the label columns of the requested appliances.
    Args:
        labels_df (DataFrame): Frame with one label column per appliance.
        appliances (List): Column names to keep, in the desired order.
            None keeps the frame as it is.
    Returns:
        `labels_df` itself when `appliances` is None, otherwise
        `labels_df[appliances]`.
    """
    debug(f"get_multilabels  labels_df.columns {labels_df.columns}")
    debug(f"get_multilabels  appliances {appliances}")
    return labels_df if appliances is None else labels_df[appliances]

In [48]:
def setup_one_building(dataset: DataSet, appliances, building, start, end, sample_period) -> Tuple[pd.DataFrame, MeterGroup, Dict]:
    """
    Load the selected appliances (plus mains) of one building and normalize the column names.
    Args:
        dataset (DataSet): The dataset to read from.
        appliances: Appliance names to load; also the names the columns are normalized to.
        building: Key of the building inside the dataset.
        start: Start of the time window.
        end: End of the time window.
        sample_period: Sample period used when reading the meters.
    Returns:
        A 3-tuple (all_df, metergroup, label2id): the DataFrame with normalized
        column names, the MeterGroup it was read from, and the dictionary mapping
        each normalized label to its original column id.
    """
    all_df, metergroup = read_selected_appliances(dataset=dataset, building=building,
                                                  appliances=appliances, start=start, end=end,
                                                  include_mains=True, sample_period=sample_period)
    debug(f"Length of data of all loaded meters {len(all_df)}")
    all_df, label2id = normalize_columns(all_df, metergroup, appliances)
    # Logged a second time to show the row count after the columns were renamed.
    debug(f"Length of data of all loaded meters {len(all_df)}")
    info('Meters that have been loaded (all_df.columns):\n' + str(all_df.columns))

    return all_df, metergroup, label2id

In [49]:
def create_multilabels_from_meters(meters: DataFrame, meter_group: MeterGroup, labels2id: dict) -> DataFrame:
    """
    Build on/off label columns for every meter column except the site meter.
    Args:
        meters (DataFrame): Power readings, one column per (normalized) label.
        meter_group (MeterGroup): Group used to look up each meter's on-power threshold.
        labels2id (dict): Lookup table from column label to meter id.
    Returns:
        A DataFrame with one label column per processed meter.
    """
    started_at = time.time()
    multilabels = dict()
    for col in meters.columns:
        meter_id = labels2id[col]
        info(f"Creating multilabels from meter {col}, "
             f"\nlabels2id[col] {meter_id}"
             f"\nmetergroup[labels2id[col]] {meter_group[meter_id]}")
        meter = meter_group[meter_id]
        threshold = meter.on_power_threshold()
        vals = meters[col].values.astype(float)
        if vals is not None and col != SITE_METER:
            debug(f"meters[col].values.astype(float) {col} - {vals}")
            multilabels[col] = create_labels(vals, threshold)
        else:
            # The site meter is the model input, not a target, so it gets no labels.
            debug(f"Skipping {col} - {vals}")
    timing('Create multilabels from meters {}'.format(round(time.time() - started_at, 2)))
    return DataFrame(multilabels)

#### Set up train and test data (house 1) for [1]

In [50]:
# Training split for the appliance set of [1]
train_df_taba1, train_metergroup_taba1, train_label2id_taba1 = setup_one_building(
    dataset=redd, appliances=appliances_redd1_taba, building=building,
    start=train_start_date, end=train_end_date, sample_period=6,
)
train_labels_df_taba1 = create_multilabels_from_meters(
    meters=train_df_taba1, meter_group=train_metergroup_taba1, labels2id=train_label2id_taba1,
)

# Test split for the appliance set of [1]
test_df_taba1, test_metergroup_taba1, test_label2id_taba1 = setup_one_building(
    dataset=redd, appliances=appliances_redd1_taba, building=building,
    start=test_start_date, end=test_end_date, sample_period=6,
)
test_labels_df_taba1 = create_multilabels_from_meters(
    meters=test_df_taba1, meter_group=test_metergroup_taba1, labels2id=test_label2id_taba1,
)

INFO: read_selected_appliances() starts
INFO: get_selected_metergroup() starts
INFO: get_selected_metergroup() finished
MeterGroup(meters=
  ElecMeter(instance=7, building=1, dataset='REDD', appliances=[Appliance(type='sockets', instance=1)])
  MeterGroup(meters=
    ElecMeter(instance=10, building=1, dataset='REDD', appliances=[Appliance(type='washer dryer', instance=1)])
    ElecMeter(instance=20, building=1, dataset='REDD', appliances=[Appliance(type='washer dryer', instance=1)])
  )
  ElecMeter(instance=5, building=1, dataset='REDD', appliances=[Appliance(type='fridge', instance=1)])
  ElecMeter(instance=1, building=1, dataset='REDD', site_meter, appliances=[])
  ElecMeter(instance=12, building=1, dataset='REDD', appliances=[Appliance(type='unknown', instance=1)])
  MeterGroup(meters=
    ElecMeter(instance=3, building=1, dataset='REDD', appliances=[Appliance(type='electric oven', instance=1)])
    ElecMeter(instance=4, building=1, dataset='REDD', appliances=[Appliance(type='electr

INFO: Creating multilabels from meter microwave, 
labels2id[col] ElecMeterID(instance=11, building=1, dataset='REDD')
metergroup[labels2id[col]] ElecMeter(instance=11, building=1, dataset='REDD', appliances=[Appliance(type='microwave', instance=1)])
DEBUG: meters[col].values.astype(float) microwave - [5. 5. 5. ... 0. 0. 0.]
TIMING: Create multilabels from meters 0.27



#### Set up train and test data (house 1) for [2]

In [51]:
# Training split for the appliance set of [2]
train_df_nalm1, train_metergroup_nalm1, train_label2id_nalm1 = setup_one_building(
    dataset=redd, appliances=appliances_redd1_nalm, building=building,
    start=train_start_date, end=train_end_date, sample_period=6,
)
train_labels_df_nalm1 = create_multilabels_from_meters(
    meters=train_df_nalm1, meter_group=train_metergroup_nalm1, labels2id=train_label2id_nalm1,
)

# Test split for the appliance set of [2]
test_df_nalm1, test_metergroup_nalm1, test_label2id_nalm1 = setup_one_building(
    dataset=redd, appliances=appliances_redd1_nalm, building=building,
    start=test_start_date, end=test_end_date, sample_period=6,
)
test_labels_df_nalm1 = create_multilabels_from_meters(
    meters=test_df_nalm1, meter_group=test_metergroup_nalm1, labels2id=test_label2id_nalm1,
)

INFO: read_selected_appliances() starts
INFO: get_selected_metergroup() starts
INFO: get_selected_metergroup() finished
MeterGroup(meters=
  ElecMeter(instance=7, building=1, dataset='REDD', appliances=[Appliance(type='sockets', instance=1)])
  ElecMeter(instance=5, building=1, dataset='REDD', appliances=[Appliance(type='fridge', instance=1)])
  ElecMeter(instance=1, building=1, dataset='REDD', site_meter, appliances=[])
  ElecMeter(instance=12, building=1, dataset='REDD', appliances=[Appliance(type='unknown', instance=1)])
  MeterGroup(meters=
    ElecMeter(instance=3, building=1, dataset='REDD', appliances=[Appliance(type='electric oven', instance=1)])
    ElecMeter(instance=4, building=1, dataset='REDD', appliances=[Appliance(type='electric oven', instance=1)])
  )
  ElecMeter(instance=9, building=1, dataset='REDD', appliances=[Appliance(type='light', instance=1)])
  ElecMeter(instance=11, building=1, dataset='REDD', appliances=[Appliance(type='microwave', instance=1)])
)
TIMING: 0.

TIMING: Create multilabels from meters 0.23



### Time series length

In [59]:
from enum import Enum
class TimeSeriesLength(Enum):
    """
    Supported lengths for one segment (window) of the time series used for inference.
    Each member's value is a short textual tag for the duration.
    """
    # A single sample per segment
    WINDOW_SAMPLE_PERIOD = 'same'

    # Minute-scale windows
    WINDOW_1_MIN = '1m'
    WINDOW_5_MINS = '5m'
    WINDOW_10_MINS = '10m'
    WINDOW_30_MINS = '30m'

    # Hour-scale windows
    WINDOW_1_HOUR = '1h'
    WINDOW_2_HOURS = '2h'
    WINDOW_3_HOURS = '3h'
    WINDOW_4_HOURS = '4h'
    WINDOW_8_HOURS = '8h'

    # Day-scale windows
    WINDOW_1_DAY = '1d'
    WINDOW_1_WEEK = '1w'
    
def get_window(dt: TimeSeriesLength, sample_period) -> int:
    """
    Translate a TimeSeriesLength into a number of samples for the given sample period.
    Returns:
        The window size in samples, truncated to int; 1 when `dt` is not a known length.
    """
    per_minute = get_no_of_samples_per_min(sample_period)
    per_hour = get_no_of_samples_per_hour(sample_period)
    per_day = get_no_of_samples_per_day(sample_period)

    samples_by_length = {
        TimeSeriesLength.WINDOW_SAMPLE_PERIOD: 1,
        TimeSeriesLength.WINDOW_1_MIN: per_minute,
        TimeSeriesLength.WINDOW_5_MINS: per_minute * 5,
        TimeSeriesLength.WINDOW_10_MINS: per_minute * 10,
        TimeSeriesLength.WINDOW_30_MINS: per_minute * 30,
        TimeSeriesLength.WINDOW_1_HOUR: per_hour,
        TimeSeriesLength.WINDOW_2_HOURS: per_hour * 2,
        TimeSeriesLength.WINDOW_3_HOURS: per_hour * 3,
        TimeSeriesLength.WINDOW_4_HOURS: per_hour * 4,
        TimeSeriesLength.WINDOW_8_HOURS: per_hour * 8,
        TimeSeriesLength.WINDOW_1_DAY: per_day,
        TimeSeriesLength.WINDOW_1_WEEK: per_day * 7,
    }
    return int(samples_by_length.get(dt, 1))
    
def get_no_of_samples_per_min(sample_period):
    """
    Number of samples that fit into one minute for the given sample period (in seconds).
    Returns:
        60 divided by the sample period (true division, so the result is a float).
    """
    seconds_per_minute = 60
    return seconds_per_minute / sample_period


def get_no_of_samples_per_hour(sample_period):
    """
    Number of samples that fit into one hour for the given sample period.
    Returns:
        The per-minute sample count multiplied by 60.
    """
    per_minute = get_no_of_samples_per_min(sample_period)
    return per_minute * 60


def get_no_of_samples_per_day(sample_period):
    """
    Number of samples that fit into one day for the given sample period.
    Returns:
        The per-hour sample count multiplied by 24.
    """
    per_hour = get_no_of_samples_per_hour(sample_period)
    return per_hour * 24

### Bucketize data / delay embedding

In [60]:
def bucketize_data(data: np.ndarray, window: int) -> np.ndarray:
    """
    Cut the time series into consecutive segments of `window` samples each.
    Args:
        data (ndarray): The time series, 1-D or 2-D with time along the first axis.
        window (int): The number of samples per segment.
    Returns:
        The reshaped array: (segments, window) for 1-D input,
        (segments, window, data.shape[1]) for 2-D input.
    Raises:
        Exception: If `data` is neither 1-D nor 2-D.
    """
    debug('bucketize_data: Initial shape {}'.format(data.shape))
    n_dims = len(data.shape)
    if n_dims not in (1, 2):
        raise Exception('Invalid number of dimensions {}.'.format(n_dims))

    n_segments = int(len(data) / window)
    # Trailing axes (none for 1-D, the feature axis for 2-D) are carried over unchanged.
    target_shape = (n_segments, window) + tuple(data.shape[1:])
    seq_in_batches = np.reshape(data, target_shape)

    debug('bucketize_data: Shape in batches: {}'.format(seq_in_batches.shape))
    return seq_in_batches

In [61]:
def bucketize_target(target: np.ndarray, window: int) -> np.ndarray:
    """
    Collapse per-sample labels into one label set per segment.
    Args:
        target (ndarray): Target data with the original size.
        window (int): The number of samples per segment.
    Returns:
        A boolean array with one entry per segment (and per label column):
        True when any sample inside the segment is non-zero.
    """
    windows = bucketize_data(target, window)
    # Reducing over axis 1 (the samples inside each segment) marks a label as
    # present when it is on at least once in that segment.
    merged = np.any(windows, axis=1)
    debug('bucketize_target: Shape of array in windows: {}'.format(windows.shape))
    debug('bucketize_target: Shape of array after merging windows: {}'.format(merged.shape))
    return merged

In [62]:
def takens_embedding(series: np.ndarray, delay, dimension) -> np.ndarray:
    """
    Compute the Takens (time delay) embedding of a series.

    Row i of the result is the series shifted by i * delay samples; every row
    holds len(series) - delay * dimension points.
    Args:
        series (ndarray): The input series.
        delay: The delay between consecutive rows, in samples.
        dimension: The number of rows of the embedding.
    Returns:
        An array with `dimension` rows. When delay * dimension == len(series) the
        rows have zero length. When delay * dimension > len(series) a message is
        logged and `series` is returned unchanged.
    """
    span = delay * dimension
    n_samples = len(series)
    if span > n_samples:
        info(f'Not enough data for the given delay ({delay}) and dimension ({dimension}).'
             f'\ndelay * dimension > len(data): {delay * dimension} > {len(series)}')
        return series

    # Collect all rows first and build the array once; appending row by row
    # would re-copy the growing array on every iteration.
    rows = [series[0:n_samples - span]]
    rows.extend(series[i * delay:n_samples - delay * (dimension - i)] for i in range(1, dimension))
    return np.array(rows)

In [63]:
def approximate(delay_in_seconds: int, dimension: int, sample_period: int, series_in_segments: np.ndarray, window: int = 1, should_fit: bool = True) -> np.ndarray:
    """
    Compute the delay embedding of every segment of a segmented time series.

    The delay is converted from seconds to whole samples (truncating), e.g. a
    32 s delay at a 6 s sample period becomes 5 samples. `window` and
    `should_fit` are accepted but not used.
    Returns:
        An array holding one embedding per segment.
    Raises:
        Exception: If delay (in samples) * dimension exceeds the length of the first segment.
    """
    delay_items = int(delay_in_seconds / sample_period)
    window_size = delay_items * dimension
    segment_length = len(series_in_segments[0])

    if window_size > segment_length:
        raise Exception(
            f'Not enough data for the given delay ({delay_in_seconds} seconds) and dimension ({dimension}).'
            f'\ndelay_items * dimension > len(data): {window_size} > {segment_length}')
    elif window_size == segment_length:
        info(f"TimeDelayEmbeddingAdapter is applied with delay embeddings equavalent to the length of each segment"
             f" {window_size} == {segment_length}")
    else:
        info(f"TimeDelayEmbeddingAdapter is applied with delay embeddings covering less than the length of each "
             f"segment. {window_size} < {segment_length}")

    return np.asarray([takens_embedding(segment, delay_items, dimension)
                       for segment in series_in_segments])

In [67]:
def reduce_dimensions(data_in_batches: np.ndarray, window: int, sample_period: int,
                      dimension: int, delay_in_seconds: int, should_fit: bool = True):
    """
    Replace every segment of the time series by its delay embedding, using approximate().
    Args:
        data_in_batches (ndarray): The data of the time series separated in segments.
        window (int): The size of the segments; forwarded to approximate().
        sample_period (int): The sample period of the data, in seconds.
        dimension (int): The dimension of the delay embedding.
        delay_in_seconds (int): The time delay of the embedding, in seconds.
        should_fit (bool): Forwarded to approximate() as given by the caller.
    Returns:
        The transformed time series as an array (ndarray).
    """
    squeezed_seq = approximate(delay_in_seconds, dimension, sample_period=sample_period,
                               series_in_segments=data_in_batches, window=window,
                               should_fit=should_fit)

    debug('Shape of squeezed seq: {}'.format(squeezed_seq.shape))
    return squeezed_seq

In [68]:
 def _preprocess(data_df, labels_df, appliances, window_len, sample_period, 
                 dimension, delay_in_seconds, should_fit: bool = True):
    """
    Turn the site meter readings and the appliance labels into segmented features and targets.

    Both series are trimmed to a whole number of windows, cut into segments,
    and the feature segments are replaced by their delay embeddings.
    Returns:
        A tuple (data, target): the embedded feature segments and the
        per-segment label sets.
    """
    # Features: the site meter readings
    tic = time.time()
    data = get_site_meter_data(data_df)
    timing(f"get features time {time.time() - tic}")
    debug(f"Features \n {data[:10]}")

    # Targets: the label columns of the requested appliances
    target = np.array(get_multilabels(labels_df, appliances).values)
    debug(f"Target \n {target[:10]}")

    # Drop the trailing samples that do not fill a complete window
    window = get_window(window_len, sample_period)
    leftover = len(data) % window
    if leftover > 0:
        data, target = data[:-leftover], target[:-leftover]

    target = bucketize_target(target, window)
    data = bucketize_data(data, window)

    tic = time.time()
    data = reduce_dimensions(data, window, sample_period, dimension, delay_in_seconds, should_fit)
    timing(f"reduce dimensions time {time.time() - tic}")

    return data, target

In [69]:
def prep_train(train_df, train_labels_df, sample_period, appliances: list, window_len : TimeSeriesLength,
              dimension, delay_in_seconds):
    """
    Prepare the training data for the specified appliances.
    Args:
        train_df: Frame holding the site meter readings of the training period.
        train_labels_df: Frame holding the on/off labels of the training period.
        sample_period: The sample period of the data, in seconds.
        appliances (List): List of appliances to be recognized.
        window_len (TimeSeriesLength): The length of each segment.
        dimension: The dimension of the delay embedding.
        delay_in_seconds: The time delay of the embedding, in seconds.
    Returns:
        A tuple (data, target); 3-D feature data is flattened to one row per segment.
    """
    info("Prepossessing before training...")
    tic = time.time()
    data, target = _preprocess(train_df, train_labels_df, appliances, window_len, sample_period,
                               dimension, delay_in_seconds)
    timing(f"preprocess time {time.time() - tic}")

    if len(data.shape) == 3:
        # Flatten each (rows, points) embedding into a single feature vector.
        n_segments, n_rows, n_points = data.shape
        data = np.reshape(data, (n_segments, n_rows * n_points))

    return data, target
    

In [70]:
def prep_test(test_df, test_labels_df, sample_period, appliances: list, window_len : TimeSeriesLength,
             dimension, delay_in_seconds):
    """
    Prepare the test data for the specified appliances.
    Args:
        test_df: Frame holding the site meter readings of the test period.
        test_labels_df: Frame holding the on/off labels of the test period.
        sample_period: The sample period of the data, in seconds.
        appliances (List): List of appliances to be recognized.
        window_len (TimeSeriesLength): The length of each segment.
        dimension: The dimension of the delay embedding.
        delay_in_seconds: The time delay of the embedding, in seconds.
    Returns:
        A tuple (data, target); 3-D feature data is flattened to one row per segment.
    """
    info("Prepossessing before testing...")
    tic = time.time()
    data, target = _preprocess(test_df, test_labels_df, appliances, window_len, sample_period,
                               dimension, delay_in_seconds)
    timing(f"preprocess time {time.time() - tic}")

    if len(data.shape) == 3:
        # Flatten each (rows, points) embedding into a single feature vector.
        n_segments, n_rows, n_points = data.shape
        data = np.reshape(data, (n_segments, n_rows * n_points))

    return data, target
    

### Comparison with existing multi-label NILM systems
<br>
According to the literature, the models proposed by <b>Tabatabaei et al. [1]</b> are often used as a strong baseline to evaluate state-of-the-art solutions. In the time domain, the method used is time delay embedding. It has two parameters: the time delay and the dimension of the embeddings. I use the exact same time delay and dimension as in [1]: a <b>time delay of 32 seconds and a dimension of 8</b>. The <b>sampling rate is 6s</b> for all houses in REDD; however, the window isn't specified in their paper. I run my experiment with <b>3 chosen windows: 1 hour, 2 hours, and 4 hours.</b> The authors didn't mention the number of appliances they used in their experiments, so I use only the appliances mentioned in their paper. The appliances for <b>House 1 are oven, fridge, light, microwave, bath gfi, outlet, washer dryer</b>. The appliances for <b>House 3 are electronics, furnace, washer dryer, microwave, bath gfi, kitchen outlet</b>. 



Besides [1], <b>Nalmpantis et al. [2]</b> used a setup similar to [1] in their comparison. They found the best time delay to be <b>30 seconds and the best dimension to be 6. The sampling rate is 6s for all houses in REDD, and the window is set to 5 minutes</b>. The appliances in <b>House 1 are oven, refrigerator, light, microwave, bath GFI and kitchen outlet. For House 3, the appliances are electronics, furnace, washer dryer, microwave, bath GFI and kitchen outlet</b>. I will run another experiment using exactly these parameters and compare the result to [2].


References: </br>
[1] Tabatabaei, S. M., Dick, S., & Xu, W. (2016). Toward non-intrusive load monitoring via multi-label classification. IEEE Transactions on Smart Grid, 8(1), 26-40.

[2] Nalmpantis, C., & Vrakas, D. (2020). On time series representations for multi-label NILM. Neural Computing and Applications, 32, 17275-17290.

### Window

In [71]:
# Short aliases for the window lengths used in the experiments:
# 5 minutes is the window stated for [2]; 1, 2 and 4 hours are the windows chosen for [1].
ts_len_5_min = TimeSeriesLength.WINDOW_5_MINS
ts_len_1hr = TimeSeriesLength.WINDOW_1_HOUR
ts_len_2hr = TimeSeriesLength.WINDOW_2_HOURS
ts_len_4hr = TimeSeriesLength.WINDOW_4_HOURS

### Classifiers
#### - MlkNN
#### - Rakel

<b>Note for MlkNN</b> </br>

If you run into the error `TypeError: __init__() takes 1 positional argument but 2 were given`, as I did, you can try to fix it in one of these ways:
* Uninstall the current sklearn version and install sklearn = 0.2.4
* Locally by modifying L165 in _compute_cond from </br>
self.knn_ = NearestNeighbors(self.k).fit(X)
* to </br>
self.knn_ = NearestNeighbors(n_neighbors=self.k).fit(X)


In [72]:
from skmultilearn.adapt import MLkNN
from skmultilearn.ensemble import RakelD
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, classification_report

# Multi-label k-nearest-neighbours classifier
classifier_MlkNN = MLkNN(k=3, s=1.0, ignore_first_neighbours=0)

# RAkEL-d ensemble built on a three-layer MLP base classifier
classifier_Rakel = RakelD(
    base_classifier=MLPClassifier(
        hidden_layer_sizes=(100, 100, 100),
        solver='adam',
        learning_rate='adaptive',
    ),
    labelset_size=5,
)

#### [1], house 1, 1 hour

In [75]:
# Experiment: appliance set of [1], house 1, 1-hour windows.
# Embedding parameters: 6 s sample period, dimension 8, 32 s delay
# (approximate() converts the delay to int(32 / 6) = 5 samples).
sample_period = 6
dimension = 8
delay_in_seconds = 32
# Build the flattened delay-embedding features and per-window label sets for both splits.
train_data, train_target = prep_train(train_df_taba1, train_labels_df_taba1, sample_period, 
                                      appliances_redd1_taba, ts_len_1hr, dimension, delay_in_seconds)
test_data, test_target = prep_test(test_df_taba1, test_labels_df_taba1, sample_period, 
                                   appliances_redd1_taba, ts_len_1hr, dimension, delay_in_seconds)
# MLkNN: fit on the training split, then score the predictions on the test split.
classifier_MlkNN.fit(train_data, train_target)
predictions_nn = classifier_MlkNN.predict(test_data)
micro = f1_score(test_target, predictions_nn, average='micro')
macro = f1_score(test_target, predictions_nn, average='macro')
info('MlkNN report')
info('F1 macro {}'.format(macro))
info('F1 micro {}'.format(micro))
report_mlknn_1hr = classification_report(test_target, predictions_nn, target_names=appliances_redd1_taba, output_dict=True)
print(report_mlknn_1hr)

# RakelD: same procedure; `micro` and `macro` are overwritten with the RakelD scores.
classifier_Rakel.fit(train_data, train_target)
predictions_rak = classifier_Rakel.predict(test_data)
micro = f1_score(test_target, predictions_rak, average='micro')
macro = f1_score(test_target, predictions_rak, average='macro')
info('RakelD report')
info('F1 macro {}'.format(macro))
info('F1 micro {}'.format(micro))
report_rakeld_1hr = classification_report(test_target, predictions_rak, target_names=appliances_redd1_taba, output_dict=True)
print(report_rakeld_1hr)

INFO: Prepossessing before training...
TIMING: get features time 0.0

DEBUG: Features 
 [225.28334 222.60167 222.67667 222.35    222.58667 224.87833 223.71
 224.68832 224.195   226.35   ]
DEBUG: get_multilabels  labels_df.columns Index(['sockets', 'washer dryer', 'fridge', 'unknown', 'electric oven',
       'light', 'microwave'],
      dtype='object')
DEBUG: get_multilabels  appliances ['electric oven', 'fridge', 'light', 'microwave', 'washer dryer', 'unknown', 'sockets']
DEBUG: Target 
 [[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]]
DEBUG: bucketize_data: Initial shape (411600, 7)
DEBUG: bucketize_data: Shape in batches: (686, 600, 7)
DEBUG: bucketize_target: Shape of array in windows: (686, 600, 7)
DEBUG: bucketize_target: Shape of array after merging windows: (686, 7)
DEBUG: bucketize_data: I

  return np.array(label_sets)


INFO: RakelD report
INFO: F1 macro 0.4435139046129949
INFO: F1 micro 0.8201581027667985
{'electric oven': {'precision': 0.3333333333333333, 'recall': 0.06666666666666667, 'f1-score': 0.1111111111111111, 'support': 15}, 'fridge': {'precision': 0.9683544303797469, 'recall': 0.9935064935064936, 'f1-score': 0.9807692307692308, 'support': 154}, 'light': {'precision': 0.6530612244897959, 'recall': 0.9411764705882353, 'f1-score': 0.7710843373493975, 'support': 102}, 'microwave': {'precision': 0.4, 'recall': 0.045454545454545456, 'f1-score': 0.0816326530612245, 'support': 44}, 'washer dryer': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}, 'unknown': {'precision': 1.0, 'recall': 0.08695652173913043, 'f1-score': 0.16, 'support': 46}, 'sockets': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 159}, 'micro avg': {'precision': 0.8718487394957983, 'recall': 0.7742537313432836, 'f1-score': 0.8201581027667985, 'support': 536}, 'macro avg': {'precision': 0.622106998314

  _warn_prf(average, modifier, msg_start, len(result))


#### [1], house 1, 2 hours

In [76]:
# Experiment: appliance set of [1], house 1, 2-hour windows.
# Embedding parameters: 6 s sample period, dimension 8, 32 s delay
# (approximate() converts the delay to int(32 / 6) = 5 samples).
sample_period = 6
dimension = 8
delay_in_seconds = 32
# Build the flattened delay-embedding features and per-window label sets for both splits.
train_data, train_target = prep_train(train_df_taba1, train_labels_df_taba1, sample_period, 
                                      appliances_redd1_taba, ts_len_2hr, dimension, delay_in_seconds)
test_data, test_target = prep_test(test_df_taba1, test_labels_df_taba1, sample_period, 
                                   appliances_redd1_taba, ts_len_2hr, dimension, delay_in_seconds)
# MLkNN: the classifier instance from the earlier cell is fitted again on the 2-hour data.
classifier_MlkNN.fit(train_data, train_target)
predictions_nn = classifier_MlkNN.predict(test_data)
micro = f1_score(test_target, predictions_nn, average='micro')
macro = f1_score(test_target, predictions_nn, average='macro')
info('MlkNN report')
info('F1 macro {}'.format(macro))
info('F1 micro {}'.format(micro))
report_mlknn_2hr = classification_report(test_target, predictions_nn, target_names=appliances_redd1_taba, output_dict=True)
print(report_mlknn_2hr)

# RakelD: same procedure; `micro` and `macro` are overwritten with the RakelD scores.
classifier_Rakel.fit(train_data, train_target)
predictions_rak = classifier_Rakel.predict(test_data)
micro = f1_score(test_target, predictions_rak, average='micro')
macro = f1_score(test_target, predictions_rak, average='macro')
info('RakelD report')
info('F1 macro {}'.format(macro))
info('F1 micro {}'.format(micro))
report_rakeld_2hr = classification_report(test_target, predictions_rak, target_names=appliances_redd1_taba, output_dict=True)
print(report_rakeld_2hr)

INFO: Prepossessing before training...
TIMING: get features time 0.00024700164794921875

DEBUG: Features 
 [225.28334 222.60167 222.67667 222.35    222.58667 224.87833 223.71
 224.68832 224.195   226.35   ]
DEBUG: get_multilabels  labels_df.columns Index(['sockets', 'washer dryer', 'fridge', 'unknown', 'electric oven',
       'light', 'microwave'],
      dtype='object')
DEBUG: get_multilabels  appliances ['electric oven', 'fridge', 'light', 'microwave', 'washer dryer', 'unknown', 'sockets']
DEBUG: Target 
 [[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]]
DEBUG: bucketize_data: Initial shape (411600, 7)
DEBUG: bucketize_data: Shape in batches: (343, 1200, 7)
DEBUG: bucketize_target: Shape of array in windows: (343, 1200, 7)
DEBUG: bucketize_target: Shape of array after merging windows: (343, 7)
DEB

  _warn_prf(average, modifier, msg_start, len(result))
  return np.array(label_sets)


INFO: RakelD report
INFO: F1 macro 0.4030126190527193
INFO: F1 micro 0.7258064516129031
{'electric oven': {'precision': 1.0, 'recall': 0.08333333333333333, 'f1-score': 0.15384615384615385, 'support': 12}, 'fridge': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 79}, 'light': {'precision': 0.7272727272727273, 'recall': 0.2711864406779661, 'f1-score': 0.39506172839506176, 'support': 59}, 'microwave': {'precision': 0.6666666666666666, 'recall': 0.0625, 'f1-score': 0.11428571428571428, 'support': 32}, 'washer dryer': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'unknown': {'precision': 0.6, 'recall': 0.09090909090909091, 'f1-score': 0.15789473684210525, 'support': 33}, 'sockets': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 79}, 'micro avg': {'precision': 0.9523809523809523, 'recall': 0.5863192182410424, 'f1-score': 0.7258064516129031, 'support': 307}, 'macro avg': {'precision': 0.7134199134199134, 'recall': 0.3582755521314843, 'f1-scor

  _warn_prf(average, modifier, msg_start, len(result))


#### [1], house 1, 4 hours

In [77]:
# House 1, 4-hour windows, appliance list `appliances_redd1_taba`.
# Both classifiers are fitted on the same prepared train split and scored on
# the same prepared test split.
sample_period = 6        # NOTE(review): presumably seconds, like delay_in_seconds - confirm
dimension = 8            # passed to prep_train/prep_test together with delay_in_seconds
delay_in_seconds = 32
train_data, train_target = prep_train(train_df_taba1, train_labels_df_taba1, sample_period,
                                      appliances_redd1_taba, ts_len_4hr, dimension, delay_in_seconds)
test_data, test_target = prep_test(test_df_taba1, test_labels_df_taba1, sample_period,
                                   appliances_redd1_taba, ts_len_4hr, dimension, delay_in_seconds)

# Some labels end up with an undefined precision/F-score in this run (sklearn
# reported it through `_warn_prf`). `zero_division=0` yields the same 0.0 the
# default "warn" setting reports, but states the choice explicitly and keeps
# the cell output free of repeated warnings.

# MlkNN
classifier_MlkNN.fit(train_data, train_target)
predictions_nn = classifier_MlkNN.predict(test_data)
micro = f1_score(test_target, predictions_nn, average='micro', zero_division=0)
macro = f1_score(test_target, predictions_nn, average='macro', zero_division=0)
info('MlkNN report')
info('F1 macro {}'.format(macro))
info('F1 micro {}'.format(micro))
# Per-appliance precision/recall/F1 as a dict, kept under a window-specific name.
report_mlknn_4hr = classification_report(test_target, predictions_nn, target_names=appliances_redd1_taba,
                                         output_dict=True, zero_division=0)
print(report_mlknn_4hr)

# RakelD
classifier_Rakel.fit(train_data, train_target)
predictions_rak = classifier_Rakel.predict(test_data)
# `micro` / `macro` are intentionally reused: from here on they hold the RakelD scores.
micro = f1_score(test_target, predictions_rak, average='micro', zero_division=0)
macro = f1_score(test_target, predictions_rak, average='macro', zero_division=0)
info('RakelD report')
info('F1 macro {}'.format(macro))
info('F1 micro {}'.format(micro))
report_rakeld_4hr = classification_report(test_target, predictions_rak, target_names=appliances_redd1_taba,
                                          output_dict=True, zero_division=0)
print(report_rakeld_4hr)

INFO: Prepossessing before training...
TIMING: get features time 0.0

DEBUG: Features 
 [225.28334 222.60167 222.67667 222.35    222.58667 224.87833 223.71
 224.68832 224.195   226.35   ]
DEBUG: get_multilabels  labels_df.columns Index(['sockets', 'washer dryer', 'fridge', 'unknown', 'electric oven',
       'light', 'microwave'],
      dtype='object')
DEBUG: get_multilabels  appliances ['electric oven', 'fridge', 'light', 'microwave', 'washer dryer', 'unknown', 'sockets']
DEBUG: Target 
 [[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 1.]]
DEBUG: bucketize_data: Initial shape (410400, 7)
DEBUG: bucketize_data: Shape in batches: (171, 2400, 7)
DEBUG: bucketize_target: Shape of array in windows: (171, 2400, 7)
DEBUG: bucketize_target: Shape of array after merging windows: (171, 7)
DEBUG: bucketize_data:

  _warn_prf(average, modifier, msg_start, len(result))
  return np.array(label_sets)


INFO: RakelD report
INFO: F1 macro 0.410242982823628
INFO: F1 micro 0.6666666666666667
{'electric oven': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 10}, 'fridge': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 39}, 'light': {'precision': 0.8, 'recall': 0.23529411764705882, 'f1-score': 0.3636363636363636, 'support': 34}, 'microwave': {'precision': 1.0, 'recall': 0.14814814814814814, 'f1-score': 0.25806451612903225, 'support': 27}, 'washer dryer': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 9}, 'unknown': {'precision': 1.0, 'recall': 0.14285714285714285, 'f1-score': 0.25, 'support': 28}, 'sockets': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 39}, 'micro avg': {'precision': 0.9791666666666666, 'recall': 0.5053763440860215, 'f1-score': 0.6666666666666667, 'support': 186}, 'macro avg': {'precision': 0.6857142857142857, 'recall': 0.3608999155217643, 'f1-score': 0.410242982823628, 'support': 186}, 'weighted avg': {'precisio

  _warn_prf(average, modifier, msg_start, len(result))
