# This notebook is the single entry point for every experiment notebook: all calls to the preprocessing and clustering notebooks are made from here.

In [1]:
# Libraries needed
# !pip install nbimporter # uncomment if the library is not installed

In [1]:
# Existing Notebooks
import nbimporter
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from Preprocessing.preprocessing import hourly_dataset
from Preprocessing.context_extraction import getContext
from Preprocessing.load_cuve_generation import doAggregation

from ClusteringAnalysis.ClusteringValidationMetrics import get_validation_scores
from ClusteringAnalysis.ClusteringAlgorithms import doClustering

# Built-in libraries
import time
from itertools import product
from math import log
import pickle
from datetime import datetime, timedelta # testing
from collections import OrderedDict

# NumPy, SciPy and Pandas
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

# Scikit-Learn
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import pairwise_distances

# Tslearn
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset
# from tslearn.clustering import silhouette_score

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.gridspec as gridspec
import plotly.graph_objs as go
import seaborn as sns


Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/preprocessing.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/context_extraction.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/load_cuve_generation.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/ClusteringValidationMetrics.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/ClusteringAlgorithms.ipynb


In [8]:
# check files
def checkFiles(datasetName, context, function):
    """Return the load-curve dataset of an experiment, producing missing stages.

    Processed files are looked up in ``../data/processed`` under the names
    ``<dataset>_dataset.csv``, ``<dataset>_<context>_dataset.csv`` and
    ``<dataset>_<context>_<function>_dataset.csv``.

    Parameters
    ----------
    datasetName : str
        Dataset identifier. ``'BDG-DGS'`` is special-cased: the already
        aggregated ``BDG`` and ``DGS`` files are read, stacked row-wise and
        the result is written back under the ``BDG-DGS`` name.
    context : str
        Context name used in the file names and passed to ``getContext``.
    function : str
        Load-curve aggregation function name used in the file names and
        passed to ``doAggregation``.

    Returns
    -------
    pandas.DataFrame
        The frame of the last stage (context + aggregation function).
    """
    def stage_path(*parts):
        # Every processed file shares the same directory and suffix; only the
        # underscore-joined stem differs between stages.
        return '../data/processed/{}_dataset.csv'.format('_'.join(parts))

    # if the dataset is the combination, directly load the aggregated datasets
    if datasetName == 'BDG-DGS':
        frames = [pd.read_csv(stage_path(name, context, function), index_col=0)
                  for name in ('BDG', 'DGS')]
        # DataFrame.append was removed in pandas 2.0; concat stacks the rows
        # in the same order.
        df = pd.concat(frames)
        df.to_csv(stage_path(datasetName, context, function))
        return df

    # Each stage below overwrites ``df``; only the last one is returned. The
    # earlier stages still run so that a missing intermediate file is produced
    # (presumably the helpers persist their output - verify in Preprocessing).

    # stage 1: hourly preprocessed dataset
    if os.path.isfile(stage_path(datasetName)):
        print("Preprocessed dataset already exists, loading it ...")
        df = pd.read_csv(stage_path(datasetName), index_col=0)
    else:
        print("Preprocessing dataset ...")
        df = hourly_dataset(datasetName)

    # stage 2: dataset restricted to the requested context
    if os.path.isfile(stage_path(datasetName, context)):
        print("Dataset with {} context already exists, loading it ...".format(context))
        df = pd.read_csv(stage_path(datasetName, context), index_col=0)
    else:
        print("Generating context dataset ...")
        df = getContext(datasetName, context)

    # stage 3: load curves obtained with the aggregation function
    if os.path.isfile(stage_path(datasetName, context, function)):
        print("Dataset with {} context and {} load curve aggregation function already exists, loading it ...".format(context, function))
        df = pd.read_csv(stage_path(datasetName, context, function), index_col=0)
    else:
        print("Generating load curves based on {} ...".format(function))
        df = doAggregation(datasetName, context, function)

    return df

In [15]:
# Running Experiment 
def runExperiment(datasetName, context, function, algorithm='kshape',
                  algo_parameter=range(2,11), validation_metrics='all', appendTotalFile=False):
    """Cluster one dataset for several parameter values and store the scores.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the load-curve dataset; forwarded to ``checkFiles``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    algo_parameter : iterable
        Values of the algorithm parameter (``k``) to try, one run per value.
    validation_metrics : str
        Accepted for interface compatibility; not used by this function.
    appendTotalFile : bool
        When true, the scores are also appended (without header) to
        ``../data/results/total_scores.csv``.

    Returns
    -------
    None
        Results are written to ``../data/results/<dataset>_<context>_
        <function>_<algorithm>_scores`` as ``.pkl`` and ``.csv``.
    """
    print("Running Experiment with dataset: {}, context:  {}, function: {}, algorithm: {}".format(datasetName,
                                                                                                context,
                                                                                                function,
                                                                                                algorithm))
    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # Z-normalize the data
    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values)
    # the scaler output is 3d (third dimension being 1), so squeeze it to 2d
    df_scaled = np.squeeze(df_scaled)

    # Materialise once so the same values feed both the runs and the
    # 'parameter k' column, even if a one-shot iterable was passed.
    k_values = list(algo_parameter)

    scores = [] # one entry of validation scores per parameter value
    for k in k_values:
        print("Running experiment with {} and k = {}".format(algorithm, k))
        model, labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=200)
        scores.append(get_validation_scores(df_scaled, labels))

    # name for saving the results
    obj_name = '../data/results/{}_{}_{}_{}_scores'.format(datasetName, context, function, algorithm)

    # prepend the experiment description to the score columns
    scores = pd.DataFrame.from_dict(scores)
    scores.insert(0, 'dataset', datasetName)
    scores.insert(1, 'context', context)
    scores.insert(2, 'function', function)
    scores.insert(3, 'algorithm', algorithm)
    if "k" in algorithm or algorithm == 'hierarchical':
        scores.insert(4, 'parameter k', k_values)

    # approximate to two decimals
    scores = scores.round(2)

    # save as python pickle; the context manager guarantees the file is
    # flushed and closed (the original referenced f.close without calling it)
    with open(obj_name + '.pkl', 'wb') as pickle_file:
        pickle.dump(scores, pickle_file)

    # save as csv (individual file)
    scores.to_csv('{}.csv'.format(obj_name))
    print("Scores saved in {}.csv\n".format(obj_name))

    if appendTotalFile:
        with open('../data/results/total_scores.csv', 'a') as total_file: # append to general file
            scores.to_csv(total_file, header=False)

    return


In [None]:
# Run batch of experiments
def runBatchExperiments(datasetName_list, context_list, function_list, algorithm_list, 
                  algo_parameter=range(2,11), validation_metrics='all'):
    """Run ``runExperiment`` for every combination of the given lists.

    Datasets vary slowest and algorithms fastest. Every run appends its
    scores to the shared total file (``appendTotalFile`` is passed as True).
    """
    # Lazily nested generator: iterates the four inputs in exactly the same
    # order (and re-iterates the inner ones) as four nested for-loops would.
    combinations = (
        (dataset_name, context_name, function_name, algorithm_name)
        for dataset_name in datasetName_list
        for context_name in context_list
        for function_name in function_list
        for algorithm_name in algorithm_list
    )
    for dataset_name, context_name, function_name, algorithm_name in combinations:
        runExperiment(dataset_name, context_name, function_name, algorithm_name,
                      algo_parameter, validation_metrics, True)
                        

In [5]:
# Generating clusters and centroids
def generateClusters(datasetName, context, function, algorithm='kshape', algo_parameter = 5, psu=False):
    """Cluster a dataset and let ``doClustering`` plot the resulting clusters.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the load-curve dataset; forwarded to ``checkFiles``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    algo_parameter : int or iterable of int
        Number(s) of clusters to generate. A single integer (the default) is
        treated as a one-element list.
    psu : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None
    """
    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # Z-normalize the data
    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values)
    # the scaler output is 3d (third dimension being 1), so squeeze it to 2d
    df_scaled = np.squeeze(df_scaled)

    # The default value is a bare int, which cannot be iterated directly.
    if isinstance(algo_parameter, (int, np.integer)):
        k_values = [algo_parameter]
    else:
        k_values = algo_parameter

    # run selected algorithm with the appropriate parameter; plotting is
    # delegated to doClustering through plot=True
    for k in k_values:
        doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300, plot=True)
    plt.show()
    return

In [None]:
def generateClustersPSU(datasetName, context, function, algorithm='kshape', algo_parameter = 5):
    """Plot every cluster with its curves coloured by primary space usage.

    Only the ``'BDG'`` dataset is supported; for any other dataset the
    function returns immediately without plotting.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the load-curve dataset; forwarded to ``checkFiles``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``. For ``'kshape'`` the
        returned model is read as a sequence of ``(centroid, member indices)``
        tuples; otherwise ``model.fit_predict`` is called.
    algo_parameter : int or iterable of int
        Number(s) of clusters; one figure is produced per value. A single
        integer (the default) is treated as a one-element list.

    Returns
    -------
    None
    """
    # load metadata
    if datasetName != 'BDG':
        return
    df_meta = pd.read_csv('../data/raw/meta_open.csv')

    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # Z-normalize the data
    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values)
    # the scaler output is 3d (third dimension being 1), so squeeze it to 2d
    df_scaled = np.squeeze(df_scaled)

    # get labels for all buildings
    # NOTE(review): the labels keep the row order of the metadata file and are
    # attached positionally below; this assumes that order matches the rows
    # of df - confirm.
    ground_truth_labels = df_meta[df_meta.iloc[:, 0].isin(df.index.values)]
    ground_truth_labels = ground_truth_labels['primaryspaceusage'].reset_index(drop=True)

    ground_truth_list = ['Office', 'Dormitory', 'College Classroom', 'Primary/Secondary Classroom',
                        'College Laboratory']
    colors = ['r', 'g', 'b', 'k', 'y']
    color_by_psu = dict(zip(ground_truth_list, colors))
    fallback_color = colors[-1] # any other usage shares the last colour

    # Plotting frame built once, separate from df_scaled: the array handed to
    # the clustering must stay purely numeric for every value of k (the
    # original rebound df_scaled to this frame inside the loop).
    curves = pd.DataFrame(df_scaled)
    curves['psu'] = ground_truth_labels

    # The default value is a bare int, which cannot be iterated directly.
    if isinstance(algo_parameter, (int, np.integer)):
        k_values = [algo_parameter]
    else:
        k_values = algo_parameter

    # run selected algorithm with the appropriate parameter
    for k in k_values:
        model, _ = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300)

        if algorithm == 'kshape':
            # cluster membership, initialised to cluster 0
            y_pred = np.zeros(df_scaled.shape[0], dtype=int)
            for yi in range(k):
                # second tuple element: presumably the row positions of the
                # members of cluster yi - verify against doClustering
                y_pred[model[yi][1]] = yi
        else:
            y_pred = model.fit_predict(df_scaled) # fit the data and generate the cluster labels

        # one subplot per cluster
        fig = plt.figure(figsize=(20, 40))

        for yi in range(k):
            plt.subplot(k, 1, 1 + yi)

            # for each time series in current cluster
            for _, building in curves[y_pred == yi].iterrows():
                usage = building.iloc[-1] # last column holds the PSU label
                plt.plot(building.iloc[:-1], "-", alpha=0.25, label=usage,
                         c=color_by_psu.get(usage, fallback_color))

            plt.xlim(0, 23)
            plt.ylim(-4, 4)
            plt.title("Cluster %d" % (yi + 1), fontsize = 30)
            # one legend entry per distinct label instead of one per curve
            handles, legend_labels = plt.gca().get_legend_handles_labels()
            by_label = OrderedDict(zip(legend_labels, handles))
            plt.legend(by_label.values(), by_label.keys(), loc='center left', bbox_to_anchor=(1, 0.55), prop={'size': 20})

        fig.suptitle("Dataset: {}".format(datasetName), fontsize = 35)

    plt.show()

In [None]:
def plotSilhouette(datasetName, context, function, algorithm, k):
    """Show the silhouette plot of one clustering of a load-curve dataset.

    The dataset is read from
    ``../data/processed/<dataset>_<context>_<function>_dataset.csv``,
    z-normalised, clustered with ``doClustering`` and the per-sample
    silhouette values are drawn grouped by cluster, with a dashed red line at
    the (rounded) average score.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the processed dataset file.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    k : int
        Number of clusters.

    Returns
    -------
    None
    """
    plt.ioff() # this way only plt.show() will display figures

    df = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function), index_col=0)

    # a single axes holding the silhouette plot
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    # The silhouette coefficient can range from -1 to 1; the view starts at -0.1
    ax1.set_xlim([-0.1, 1])
    # The (k+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(df.values) + (k + 1) * 10])

    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values)
    df_scaled = np.squeeze(df_scaled)

    _, cluster_labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300)
    # the boolean masks below need element-wise comparison
    cluster_labels = np.asarray(cluster_labels)

    # The silhouette_score gives the average value for all the samples.
    silhouette_avg = round(silhouette_score(df_scaled, cluster_labels), 2) # round to two decimals
    print("For k =", k, "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(df_scaled, cluster_labels)

    y_lower = 10
    for i in range(k):
        # silhouette scores of the samples in cluster i, ascending
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize = 30)

        # leave a gap of 10 before the next cluster
        y_lower = y_upper + 10

    ax1.set_title("Silhouette analysis for {} {} using {}".format(context, function, algorithm), fontsize = 30)
    ax1.set_xlabel("Silhouette coefficient values", fontsize = 30)
    ax1.set_ylabel("Cluster label", fontsize = 30)

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    # NOTE(review): two ticks lie below the -0.1 limit set above - confirm
    # the intended visible range.
    ax1.set_xticks([-0.3,-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # The rc defaults are kept for figures created later; this axes already
    # exists, so its tick label size is set explicitly as well.
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    ax1.tick_params(axis='both', labelsize=20)

    plt.show()

    return

In [None]:
def plotCurvesOneBuilding(datasetName, context, function, resolution='day'):
    """Plot, for every building, its daily profiles and the aggregated curve.

    The context dataset ``../data/processed/<dataset>_<context>_dataset.csv``
    is read with a datetime index. For each column (building) the readings
    are cut into windows of 24 hourly steps starting at each day, one figure
    shows all windows in translucent black and a second one shows the curve
    obtained by aggregating them with ``function``.

    Parameters
    ----------
    datasetName, context : str
        Identify the processed context dataset file.
    function : {'average', 'median'}
        Aggregation applied across the daily windows.
    resolution : {'day'}
        Window size; only ``'day'`` is supported.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``resolution`` or ``function`` is not one of the supported values.
        Both are checked before any file is read.
    """
    # Validate first and raise: exit() is not a reliable way to abort a
    # function (under IPython it does not stop execution of the call).
    if resolution != 'day':
        raise ValueError("Please choose a valid resolution")
    aggregators = {'average': np.mean, 'median': np.median}
    if function not in aggregators:
        raise ValueError("Please choose a valid function")
    aggregate = aggregators[function]

    dataframe = pd.read_csv('../data/processed/{}_{}_dataset.csv'.format(datasetName, context), parse_dates=True,
                            infer_datetime_format=True, index_col=0)

    availableSamples = (dataframe.resample('1D').asfreq()).index # one timestamp per day
    window = timedelta(hours=23) # inclusive end of a window, relative to its start

    # iterate through all buildings (columns)
    for column in range(len(dataframe.columns)):
        currentColumn = dataframe.iloc[:, [column]] # keep it a one-column frame

        # cut the column into one window per day; the timestamp index is
        # dropped so that windows line up by position
        daily_windows = []
        for start in availableSamples:
            end = start + window
            in_window = (currentColumn.index >= start) & (currentColumn.index <= end)
            daily_windows.append(currentColumn[in_window].reset_index(drop=True))

        # a single concat instead of growing the frame inside the loop
        if daily_windows:
            df_sampledReadings = pd.concat(daily_windows, axis=1)
        else:
            df_sampledReadings = pd.DataFrame()

        # drop windows without any reading, put one window per row and fill
        # the remaining gaps with 0
        df_sampledReadings = df_sampledReadings.dropna(axis=1, how='all').T.fillna(value=0)

        # representative curve across all windows
        load_curve = aggregate(df_sampledReadings, axis = 0)

        # figure 1: every daily profile
        plt.figure(figsize=(18,10))
        plt.ylim(0,7)
        for _, curve in df_sampledReadings.iterrows():
            plt.plot(curve, "k-", alpha=.2)
        plt.title("Load Profiles and red representative curve based on {}".format(function))
        plt.show()

        # figure 2: the representative curve on its own
        plt.figure(figsize=(18,10))
        plt.plot(load_curve, "r-", linewidth=7.0)
        plt.ylim(0,7)
        plt.show()

    return

In [None]:
def plotSC():
    """Plot silhouette score against k for every dataset/algorithm pair.

    Reads ``../data/results/total_scores.csv`` and draws a grid with one row
    per dataset and one column per algorithm; each panel holds one line per
    distinct (context, function) combination found in the file. The figure is
    saved to ``../data/plots/silhouette_all_plots.png`` and then shown.

    Columns are addressed by position (dataset, context, function, algorithm,
    parameter k) except for ``'silhouette_score'``.
    NOTE(review): this requires the file to carry a header row - confirm how
    it is created, since appended runs are written without one.

    Returns
    -------
    None
    """
    df = pd.read_csv('../data/results/total_scores.csv', index_col=0)

    datasets = ['BDG', 'DGS', 'BDG-DGS']
    algorithms = ['kshape', 'kmeans', 'hierarchical']

    # one axes per (dataset, algorithm), returned as a 2-d array
    f, axarr = plt.subplots(len(datasets), len(algorithms), figsize=(20,18))
    f.subplots_adjust(hspace=0.05, wspace=0.05)

    for i, dataset in enumerate(datasets): # rows
        df_row = df[df.iloc[:, 0] == dataset]
        for j, algorithm in enumerate(algorithms): # columns
            ax = axarr[i, j]
            # Every panel gets the same range and grid, even without data,
            # because the inner panels have their tick labels hidden.
            ax.set_ylim([-0.05, 0.6])
            ax.xaxis.grid(True)
            ax.yaxis.grid(True)
            ax.tick_params(axis='both', labelsize=20)

            df_experiment = df_row[df_row.iloc[:, 3] == algorithm]
            # distinct experiment descriptions; drop_duplicates returns a new
            # frame, so the slice is never modified in place
            df_experiment_context = df_experiment.iloc[:, 0:4].drop_duplicates()

            # one line for each (context, function) combination
            for _, experiment in df_experiment_context.iterrows():
                context_value = experiment.iloc[1]
                function_value = experiment.iloc[2]
                current_plot = df_experiment[(df_experiment.iloc[:, 1] == context_value) &
                                             (df_experiment.iloc[:, 2] == function_value)]
                x_values = current_plot.iloc[:, 4]
                y_values = current_plot['silhouette_score']
                context_name = str(context_value) + " " + str(function_value)
                ax.plot(x_values, y_values, label=context_name)

    # column titles and row labels
    cols = ['K-Shape', 'K-Means', 'Hierarchical']
    for ax, col in zip(axarr[0], cols):
        ax.set_title(col, fontsize = 30)

    for ax, row in zip(axarr[:,0], datasets):
        ax.set_ylabel(row, rotation=90, fontsize=30)

    f.text(0.5, 0.08, 'K', ha='center', fontsize=30)

    # the legend is attached to the last axes created (bottom-right panel)
    axarr[-1, -1].legend(loc='center left', bbox_to_anchor=(1, 1.55), prop={'size': 20})
    # rc defaults are kept for figures created later
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)

    # hide x tick labels except on the bottom row and y tick labels except on
    # the first column
    plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False)
    plt.setp([a.get_xticklabels() for a in axarr[1, :]], visible=False)
    plt.setp([a.get_yticklabels() for a in axarr[:, 1]], visible=False)
    plt.setp([a.get_yticklabels() for a in axarr[:, 2]], visible=False)

    # save before showing: some backends release the figure once it is shown
    f.savefig("../data/plots/silhouette_all_plots.png")
    plt.show()


In [2]:
def plotGroundTruthDist(datasetName, context, function, algorithm, k):
    """Show how the primary space usages are distributed over the clusters.

    Only the ``'BDG'`` dataset is supported; for any other dataset the
    function returns immediately. The processed dataset is z-normalised and
    clustered with ``doClustering``; a horizontal stacked bar is drawn per
    cluster, annotated with the count of each usage and the cluster total.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the processed dataset file.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    k : int
        Number of clusters.

    Returns
    -------
    None
    """
    # load metadata
    if datasetName != 'BDG':
        return
    df_meta = pd.read_csv('../data/raw/meta_open.csv')
    # load data
    df_data = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function), index_col=0)

    # Z-normalize the data
    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df_data.values)
    # the scaler output is 3d (third dimension being 1), so squeeze it to 2d
    df_scaled = np.squeeze(df_scaled)

    # run algorithm
    model, labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300)
    labels = np.asarray(labels) # element-wise comparison is needed below
    # row positions of the members of each cluster
    data_dict = {i: np.where(labels == i)[0] for i in range(k)}

    df_dist = pd.DataFrame()

    ground_truth_list = ['Office', 'Dormitory', 'College Classroom', 'Primary/Secondary Classroom',
                        'College Laboratory']

    for key, value in data_dict.items():
        # retrieve original building names based on row position
        bdg_ids = df_data.iloc[value.tolist()]
        # metadata rows of those buildings
        ground_truth_labels = df_meta[df_meta.iloc[:, 0].isin(bdg_ids.index.values)]

        # count the times the different ground truth labels exist in each cluster
        aux = ground_truth_labels['primaryspaceusage'].value_counts()
        # a usage absent from the cluster is recorded with a count of 0
        # (Series.set_value was removed in pandas 1.0; .loc enlarges in place)
        for psu in ground_truth_list:
            if psu not in aux.index:
                aux.loc[psu] = 0
        # NOTE(review): after the first cluster, the assignment aligns on the
        # existing index, so usages outside it are not added - confirm.
        df_dist[key] = aux

    df_dist.index.name = 'PSU'

    # plot stacked bar
    ax = df_dist.T.plot.barh(stacked=True, figsize=(20, 18), mark_right = True) # original without text

    plt.legend(loc='best', prop={'size': 30})

    # annotate the bars: one row per cluster, one column per usage
    df = df_dist.T.copy()
    df_total = df.sum(axis=1)
    cumulative = df.cumsum(1) # right edge of every stacked segment

    # cluster totals at the end of each bar, drawn once per cluster
    for i, tot in enumerate(df_total):
        plt.text(tot, i, str(int(tot)), va='center', fontsize = 30)

    # NOTE(review): the segment labels are absolute counts, as in the original
    # annotation loop, although it was introduced as showing percentages -
    # confirm which one is intended.
    for n in df.columns:
        for i, (cs, ab) in enumerate(zip(cumulative[n], df[n])):
            if np.isnan(ab):
                continue
            plt.text(cs - ab/2, i, str(int(ab)), va='center', ha='center', fontsize = 20)

    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    ax.xaxis.grid(True)
    plt.show()
    return

In [None]:
def getLabelHist(datasetName, context, function):
    """Build a plotly histogram of the building-type labels of a dataset.

    The metadata source, the label column (``'espm_type_name'``), the bottom
    margin and the height are hard-wired for the DGS dataset. The original
    notes give ``meta_open.csv``, ``'primaryspaceusage'`` and a bottom margin
    of 330 as the BDG counterparts. ``datasetName`` only selects the
    processed file and the title.

    Returns
    -------
    plotly.graph_objs.FigureWidget
    """
    processed_file = '../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function)

    # DGS metadata: one row per building id with its type label
    df_meta = pd.read_csv('../data/raw/dgs_metadata.csv')
    df = pd.read_csv(processed_file, index_col=0)

    # Raw DGS readings, transposed so that the first column can be matched
    # against the names used as index of the processed dataset.
    raw_transposed = pd.read_csv("../data/raw/DGS_322_Buildings-15m-By_Building-DST-gap-filled-3-2-18-508pm.csv").T
    name_matches = raw_transposed.iloc[:, 0].isin(df.index.values)
    matched_ids = raw_transposed[name_matches].index.values # ids based on names
    df_hist = df_meta[df_meta['id'].isin(matched_ids)] # labels based on ids

    trace = dict(
        type='histogram',
        x=df_hist['espm_type_name'],
    )
    hist = go.FigureWidget(data=[trace])

    # title
    hist.layout.title = '{} PSU Histogram'.format(datasetName)
    hist.layout.titlefont.size = 30

    # axes
    hist.layout.xaxis.tickangle = 90
    hist.layout.xaxis.tickfont.size = 20
    hist.layout.yaxis.tickfont.size = 20

    # room for the rotated labels (values chosen for DGS)
    hist.layout.height = 1000
    hist.layout.margin.b = 550

    return hist

In [3]:
"""
As of now, only works for BDG, needs tweaking for DGS and BDG-DGS
"""
def _membership_psu_labels(datasetName):
    """Return the ordered ground-truth PSU labels reported for ``datasetName``.

    Raises ``ValueError`` for anything other than 'BDG', 'DGS' or 'BDG-DGS'.
    """
    bdg_labels = ['Office', 'Dormitory', 'College Classroom', 'Primary/Secondary Classroom',
                  'College Laboratory']
    dgs_labels = ['Other/Specialty Hospital',
                  'Worship Facility',
                  'Senior Care Community',
                  'Other',
                  'Urgent Care Center/Clinic/Other Outpatient Office',
                  'Other - Technology/Science',
                  'Food Sales',
                  'Other - Entertainment/Public Assembly',
                  'Non-Refrigerated Warehouse',
                  'Multifamily Housing',
                  'Other - Lodging/Residential',
                  'Swimming Pool',
                  'Other - Education',
                  'Parking',
                  'Fire Station',
                  'Library',
                  'K-12 School',
                  'Office',
                  'Social/Meeting Hall',
                  'Other - Recreation',
                  'Other - Public Services',
                  'Police Station']
    if datasetName == 'BDG':
        return bdg_labels
    if datasetName == 'DGS':
        return dgs_labels
    if datasetName == 'BDG-DGS':
        # 'K-12 School' is replaced by 'Primary/Secondary Classroom' and
        # 'Office' already appears in the BDG part of the list
        merged_away = ('K-12 School', 'Office')
        return bdg_labels + [psu for psu in dgs_labels if psu not in merged_away]
    raise ValueError("Unknown datasetName {!r}; expected 'BDG', 'DGS' or 'BDG-DGS'".format(datasetName))


def getMemberships(datasetName, context, function, algorithm, algo_parameter=range(2,11), individual=False):
    """Cluster a processed dataset and record the ground-truth make-up of every cluster.

    For each ``k`` in ``algo_parameter`` the z-normalised load curves are
    clustered with ``doClustering``. Each cluster is labelled with the PSU that
    has the highest count among its members; one row per cluster is produced
    holding the per-PSU counts, the count for the winning PSU ('CorrectLabel')
    and the count of all remaining members ('IncorrectLabel').

    Parameters
    ----------
    datasetName : str
        'BDG', 'DGS' or 'BDG-DGS'; selects metadata files and the PSU columns.
    context, function : str
        Remaining components of the processed dataset file name.
    algorithm : str
        Forwarded to ``doClustering``.
    algo_parameter : iterable of int
        Values of ``k`` to run.
    individual : bool
        If True, write a standalone CSV (with header) for this combination;
        otherwise append the rows to ``total_memberships_<datasetName>.csv``.

    Returns
    -------
    None
        Results are only written to disk.

    Raises
    ------
    ValueError
        If ``datasetName`` is not one of the supported names.
    """
    ground_truth_list = _membership_psu_labels(datasetName)
    columns = (['dataset', 'context', 'function', 'algorithm', 'parameter k', 'clusterNum', 'clusterLabel']
               + ground_truth_list
               + ['CorrectLabel', 'IncorrectLabel'])
    uses_bdg = datasetName in ('BDG', 'BDG-DGS')
    uses_dgs = datasetName in ('DGS', 'BDG-DGS')

    # load metadata
    if uses_bdg:
        df_meta_bdg = pd.read_csv('../data/raw/meta_open.csv')
    if uses_dgs:
        df_meta_dgs = pd.read_csv('../data/raw/dgs_metadata.csv')
        df_aux = pd.read_csv("../data/raw/DGS_322_Buildings-15m-By_Building-DST-gap-filled-3-2-18-508pm.csv")
        df_aux = df_aux.T

    # load data
    df_data = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function),
                          index_col=0)

    # Z-normalize the data
    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df_data.values)
    # the scaler returns a 3d array (the third dimension being 1) so we convert it to a 2d array
    df_scaled = np.squeeze(df_scaled)

    # one dict per cluster; turned into a DataFrame once at the end instead of
    # growing a frame row by row
    records = []

    # for all k's
    for k in algo_parameter:
        # run algorithm
        model, labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300)

        # positional indices (into df_data) of the members of each cluster
        data_dict = {i: np.where(labels == i)[0] for i in range(k)}
        df_dist = pd.DataFrame()

        # get the ground truth label for buildings in each cluster
        for key, value in data_dict.items():
            bdg_ids = df_data.iloc[value.tolist()] # retrieve original building name based on index
            member_names = bdg_ids.index.values

            label_parts = []
            if uses_bdg:
                # first metadata column holds the building name
                ground_truth_labels = df_meta_bdg[df_meta_bdg.iloc[:, 0].isin(member_names)]
                label_parts.append(ground_truth_labels['primaryspaceusage'])
            if uses_dgs:
                df_hist = df_aux[df_aux.iloc[:, 0].isin(member_names)] # get id based on names
                df_hist = df_meta_dgs[df_meta_dgs['id'].isin(df_hist.index.values)] # get label based on id
                label_parts.append(df_hist['espm_type_name'])

            if datasetName == 'BDG-DGS':
                # merge both sets of labels and fold K-12 into Primary/Secondary Classroom
                aux = pd.concat(label_parts, ignore_index=True)
                aux = aux.replace(to_replace='K-12 School', value='Primary/Secondary Classroom')
            else:
                aux = label_parts[0]
            # count the times the different ground truth labels exist in the cluster
            aux = aux.value_counts()

            # if a particular PSU didnt exist in the cluster, report it with a zero count
            for psu in ground_truth_list:
                if psu not in aux.index:
                    aux.loc[psu] = 0
            df_dist[key] = aux

            currentCluster = df_dist.T.tail(1) # so it's easier to treat as columns and only the current cluster
            clusterNum = currentCluster.index.values[0] # label assigned by cluster algorithm

            # based on our assumption, label matches the PSU with highest count
            clusterLabel = currentCluster.idxmax(axis=1).iloc[0]

            # the number of correct labels is the number of buildings whose PSU is the cluster label
            correctLabel = currentCluster[clusterLabel]
            # the incorrectly clustered ones are the remaining buildings
            incorrectLabel = currentCluster.copy().drop(clusterLabel, axis=1).sum(axis=1)

            row = {'dataset': datasetName,
                   'context': context,
                   'function': function,
                   'algorithm': algorithm,
                   'parameter k': k,
                   'clusterNum': clusterNum,
                   'clusterLabel': clusterLabel}
            for psu in ground_truth_list:
                row[psu] = currentCluster[psu].iloc[0]
            row['CorrectLabel'] = correctLabel.iloc[0]
            row['IncorrectLabel'] = incorrectLabel.iloc[0]
            records.append(row)

    df_memberships = pd.DataFrame(records, columns=columns)

    # save as csv
    if individual:
        # NOTE(review): 'memerbships' looks like a typo of 'memberships'; the file
        # name is kept unchanged because other code may read this path.
        df_memberships.to_csv('../data/results/total_memerbships_{}_{}_{}_{}.csv'.format(datasetName,
                                                                                        context,
                                                                                        function,
                                                                                        algorithm))
    else:
        # append to general file; emit the header only when starting a new file
        # so the result can be read back by column name
        general_path = '../data/results/total_memberships_{}.csv'.format(datasetName)
        write_header = (not os.path.exists(general_path)) or os.path.getsize(general_path) == 0
        df_memberships.to_csv(general_path, mode='a', header=write_header)

    return

In [5]:
def getBatchMemberships(datasetName_list, context_list, function_list, algorithm_list, algo_parameter=range(2,11)):
    """Run ``getMemberships`` for every combination of the given lists.

    Parameters
    ----------
    datasetName_list, context_list, function_list, algorithm_list : iterable of str
        Values to combine; iterated as dataset -> context -> function -> algorithm.
    algo_parameter : iterable of int
        Cluster counts forwarded unchanged to every ``getMemberships`` call.
    """
    # every possible combination of the lists, rightmost list varying fastest
    for datasetName, context, function, algorithm in product(datasetName_list, context_list,
                                                             function_list, algorithm_list):
        # forward the caller's k values instead of a hard-coded range
        getMemberships(datasetName, context, function, algorithm, algo_parameter=algo_parameter)


In [None]:
def _final_table_psu_labels(dataset):
    """Return the ordered PSU row labels of the final table for ``dataset``.

    Raises ``ValueError`` for anything other than 'BDG', 'DGS' or 'BDG-DGS'.
    """
    bdg_labels = ['Office', 'Dormitory', 'College Classroom', 'Primary/Secondary Classroom',
                  'College Laboratory']
    dgs_labels = ['Other/Specialty Hospital',
                  'Worship Facility',
                  'Senior Care Community',
                  'Other',
                  'Urgent Care Center/Clinic/Other Outpatient Office',
                  'Other - Technology/Science',
                  'Food Sales',
                  'Other - Entertainment/Public Assembly',
                  'Non-Refrigerated Warehouse',
                  'Multifamily Housing',
                  'Other - Lodging/Residential',
                  'Swimming Pool',
                  'Other - Education',
                  'Parking',
                  'Fire Station',
                  'Library',
                  'K-12 School',
                  'Office',
                  'Social/Meeting Hall',
                  'Other - Recreation',
                  'Other - Public Services',
                  'Police Station']
    if dataset == 'BDG':
        return bdg_labels
    if dataset == 'DGS':
        return dgs_labels
    if dataset == 'BDG-DGS':
        # 'K-12 School' is replaced by 'Primary/Secondary Classroom' and
        # 'Office' already appears in the BDG part of the list
        merged_away = ('K-12 School', 'Office')
        return bdg_labels + [psu for psu in dgs_labels if psu not in merged_away]
    raise ValueError("Unknown dataset {!r}; expected 'BDG', 'DGS' or 'BDG-DGS'".format(dataset))


def generateFinalTable(dataset, k):
    """Summarise, per PSU, the percentage of correctly / incorrectly labelled buildings.

    Reads ``total_memberships_<dataset>.csv`` and, for every
    context x function x algorithm combination at the given ``k``, computes for
    each PSU the share of its buildings that sit in a cluster labelled with that
    PSU ("Correctly Labeled") and the remaining share ("Incorrectly Labeled"),
    plus a 'Total' row over all PSUs. The per-combination column pairs are
    placed side by side and written to ``finaltable_<dataset>_<k>.csv``.

    The written table starts with one empty 'Correctly Labeled' /
    'Incorrectly Labeled' pair, followed by one pair per combination.

    Parameters
    ----------
    dataset : str
        'BDG', 'DGS' or 'BDG-DGS'.
    k : int
        Value of the 'parameter k' column to select from the results file.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    # list values for context
    contexts = ['weekday', 'weekend', 'fullweek']
    functions = ['average', 'median']
    algorithms = ['kshape', 'kmeans', 'hierarchical']

    psu_labels = _final_table_psu_labels(dataset)
    row_labels = psu_labels + ['Total']
    result_columns = ['Correctly Labeled', 'Incorrectly Labeled']

    # load results
    df = pd.read_csv("../data/results/total_memberships_{}.csv".format(dataset), index_col=0)

    # since each processed dataset has the same buildings but with different hourly readings, we can load any of them
    df_pro = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(dataset, contexts[0], functions[0]), index_col=0)
    building_names = df_pro.index.values

    # load total counts of buildings for each psu
    if dataset == 'BDG':
        df_meta = pd.read_csv('../data/raw/meta_open.csv')
        df_psu_labels = df_meta[df_meta.iloc[:, 0].isin(building_names)]
        label_dist = df_psu_labels['primaryspaceusage'].value_counts()
    elif dataset == 'DGS':
        df_meta = pd.read_csv('../data/raw/dgs_metadata.csv')
        df_aux = pd.read_csv("../data/raw/DGS_322_Buildings-15m-By_Building-DST-gap-filled-3-2-18-508pm.csv")
        df_aux = df_aux.T
        df_psu_labels = df_aux[df_aux.iloc[:, 0].isin(building_names)] # get id based on names
        df_psu_labels = df_meta[df_meta['id'].isin(df_psu_labels.index.values)] # get label based on id
        label_dist = df_psu_labels['espm_type_name'].value_counts()
    else:  # 'BDG-DGS' (other names were rejected above)
        # BDG
        df_meta_bdg = pd.read_csv('../data/raw/meta_open.csv')
        df_psu_labels = df_meta_bdg[df_meta_bdg.iloc[:, 0].isin(building_names)]
        bdg_part = df_psu_labels['primaryspaceusage']
        # DGS
        df_meta_dgs = pd.read_csv('../data/raw/dgs_metadata.csv')
        df_aux = pd.read_csv("../data/raw/DGS_322_Buildings-15m-By_Building-DST-gap-filled-3-2-18-508pm.csv")
        df_aux = df_aux.T
        df_psu_labels = df_aux[df_aux.iloc[:, 0].isin(building_names)] # get id based on names
        df_psu_labels = df_meta_dgs[df_meta_dgs['id'].isin(df_psu_labels.index.values)] # get label based on id
        # merge both set of labels
        label_dist = pd.concat([bdg_part, df_psu_labels['espm_type_name']], ignore_index=True)
        # replace K-12 to Primary/Secondary Classroom
        label_dist = label_dist.replace(to_replace='K-12 School', value='Primary/Secondary Classroom')
        # as a final step, produce the counts
        label_dist = label_dist.value_counts()

    # the leading empty pair is part of the established file layout
    frames = [pd.DataFrame(index=row_labels, columns=result_columns)]

    # iterate through all elements of the lists (algorithm varies fastest)
    for context, function, algorithm in product(contexts, functions, algorithms):
        contextName = "{}-{}-{}".format(context, function, algorithm)
        print(contextName)

        current_context = pd.DataFrame(index=row_labels, columns=result_columns)
        total_sum_correct = 0
        total_sum_incorrect = 0

        # rows of the results file belonging to this combination
        df_resampled = df[(df['context'] == context) &
                          (df['function'] == function) &
                          (df['algorithm'] == algorithm) &
                          (df['parameter k'] == k)]

        # iterate through PSU labels
        for psu in psu_labels:
            psu_total = label_dist[psu]  # number of buildings with this PSU in the dataset
            # correct label counts happen when the cluster label is the current psu
            matching = df_resampled[df_resampled['clusterLabel'] == psu]
            if matching.empty:
                correctLabel_count = 0
                correctLabel = 0
            else:
                correctLabel_count = matching['CorrectLabel'].sum()
                correctLabel = np.round(correctLabel_count / psu_total * 100, 2)
            total_sum_correct += correctLabel_count

            # incorrect labels counts is total count of this psu bdgs in the dataset - correctLabels
            incorrectLabel_count = psu_total - correctLabel_count
            total_sum_incorrect += incorrectLabel_count
            incorrectLabel = np.round(incorrectLabel_count / psu_total * 100, 2)

            current_context.loc[psu] = [correctLabel, incorrectLabel]

        # overall percentages go in the explicit 'Total' row
        grand_total = total_sum_correct + total_sum_incorrect
        current_context.loc['Total'] = [np.round(total_sum_correct / grand_total * 100, 2),
                                        np.round(total_sum_incorrect / grand_total * 100, 2)]
        frames.append(current_context)

    # place all combinations side by side in one pass
    df_final = pd.concat(frames, axis=1)

    # save to file
    df_final.to_csv("../data/results/finaltable_{}_{}.csv".format(dataset, k))
    return
    

In [1]:
def finalBarPlots(dataset, k, stacked=False, grouped='algorithm'):
    """Plot the 'Total' classification percentages stored in the final results table.

    Reads ``../data/results/finaltable_{dataset}_{k}.csv`` and draws, for every
    context/algorithm combination, the percentage found in the 'Total' column
    for the correctly and incorrectly labelled rows.

    Parameters
    ----------
    dataset : str
        Dataset name; used in the input/output file names and the plot title.
    k : int
        Number of clusters; used in the input/output file names and the plot title.
    stacked : bool, default False
        If True, draw clustered stacked bars with matplotlib (one hatch pattern
        per group). If False, draw a seaborn grouped bar plot.
    grouped : {'algorithm', 'context'}, default 'algorithm'
        Only used when ``stacked`` is True. 'algorithm' puts the algorithms on
        the x axis with one hatched bar per context; 'context' puts the contexts
        on the x axis with one hatched bar per algorithm.

    Returns
    -------
    matplotlib Axes when ``stacked`` is True, otherwise None. The figure is
    saved under ``../data/plots/`` in both cases.

    Raises
    ------
    ValueError
        If ``stacked`` is True and ``grouped`` is not a supported value.
    """
    valid_groupings = ('algorithm', 'context')
    # Fail fast (before any file access) instead of crashing later on an empty
    # list of sub-frames and an undefined legend label list.
    if stacked and grouped not in valid_groupings:
        raise ValueError("grouped must be one of {} when stacked=True, got {!r}".format(valid_groupings, grouped))

    df = pd.read_csv("../data/results/finaltable_{}_{}.csv".format(dataset, k), index_col=0)

    context_no_algo = ['weekday-average', 'weekday-median',
                       'weekend-average', 'weekend-median',
                       'fullweek-average', 'fullweek-median']

    algorithms = ['kshape', 'kmeans', 'hierarchical']

    # every context combined with every algorithm, context-major
    # ('weekday-average-kshape', 'weekday-average-kmeans', ...)
    contexts = ['{}-{}'.format(context, algo) for context in context_no_algo for algo in algorithms]

    # pre-process the dataframe
    df_transpose = pd.DataFrame(df.T.reset_index())
    # presumably strips the '.N' suffix pandas appends to repeated column names -- TODO confirm
    df_transpose['index'] = df_transpose['index'].apply(lambda x: x.split('.')[0])
    # NOTE(review): the first two transposed rows are discarded; presumably they
    # are placeholder columns of the saved table -- confirm against the writer.
    df_transpose = df_transpose.drop(df_transpose.index[[0, 1]])

    # insert context in one column, each context has two row values (correctly and incorrectly)
    df_transpose.insert(0, 'contexts', [c for c in contexts for _ in range(2)])

    def _total_pair(context, algo):
        # The two 'Total' values (first and second row) of one context/algorithm combination.
        label = '{}-{}'.format(context, algo)
        totals = df_transpose.loc[df_transpose['contexts'] == label, 'Total'].reset_index(drop=True)
        return [totals.iloc[0], totals.iloc[1]]

    if stacked:
        H = "/"
        dfall = []

        # change grouping accordingly
        if grouped == 'algorithm':
            labels = context_no_algo
            # one sub dataframe per context, indexed by algorithm
            for context in context_no_algo:
                df_subset = pd.DataFrame(columns=['Correctly', 'Incorrectly'], index=algorithms)
                for algo in algorithms:
                    df_subset.loc[algo] = _total_pair(context, algo)
                dfall.append(df_subset)
        else:  # grouped == 'context'
            labels = algorithms
            # one sub dataframe per algorithm, indexed by context
            for algo in algorithms:
                df_subset = pd.DataFrame(columns=['Correctly', 'Incorrectly'], index=context_no_algo)
                for context in context_no_algo:
                    df_subset.loc[context] = _total_pair(context, algo)
                dfall.append(df_subset)

        n_df = len(dfall)
        n_col = len(dfall[0].columns)
        n_ind = len(dfall[0].index)

        plt.figure(figsize=(16,14))
        axe = plt.subplot(111)

        for df_sub in dfall: # for each data frame
            axe = df_sub.plot(kind="bar",
                              linewidth=0,
                              stacked=True,
                              ax=axe,
                              legend=False,
                              grid=False)  # make bar plots

        h, l = axe.get_legend_handles_labels() # get the handles we want to modify
        bar_width = 0.8 / float(n_df + 1)
        # len(h) = n_col * n_df: consecutive runs of n_col handles belong to one sub dataframe
        for group_idx in range(n_df):
            start = group_idx * n_col
            for pa in h[start:start + n_col]:
                for rect in pa.patches: # for each index
                    # shift each group sideways so the stacks sit next to each other
                    rect.set_x(rect.get_x() + bar_width * group_idx)
                    rect.set_hatch(H * group_idx)
                    rect.set_width(bar_width)

        axe.set_xticks((np.arange(0, 2 * n_ind, 2) + 1 / float(n_df + 1)) / 2.)

        # all sub dataframes share the same index, so any of them provides the tick labels
        tick_rotation = 0 if grouped == 'algorithm' else 45
        axe.set_xticklabels(dfall[-1].index, rotation=tick_rotation, fontsize=30)

        axe.set_title("Percentage of Classification using k = {} for {} dataset".format(k, dataset), fontsize = 30)
        axe.tick_params(axis="y", labelsize=30)

        axe.axhline(50, linewidth=2, color='r', linestyle="--")

        # Add invisible data to add another legend
        hatch_handles = [axe.bar(0, 0, color="gray", hatch=H * i) for i in range(n_df)]

        l1 = axe.legend(h[:n_col], l[:n_col], loc=[1, 0.5], prop={'size': 20})#[1.01, 0.5])
        # the second legend call replaces the first one on the axes, so l1 is re-added afterwards
        plt.legend(hatch_handles, labels, loc=[1, 0.1], prop={'size': 20}) #[1.01, 0.1]
        axe.add_artist(l1)
        plt.savefig("../data/plots/finalBar_{}_{}_groupedBy_{}.png".format(dataset, k, grouped), bbox_inches='tight')
        return axe

    else:
        # draw nested bar plot
        g = sns.catplot(x="contexts", y="Total", hue="index", data=df_transpose,
                    height=10, kind="bar", palette="muted", legend=False)
        g.set_xticklabels(rotation=90, fontsize=20)
        g.set_yticklabels(fontsize=20)
        g.set_ylabels("Classification Percentage",fontsize=30)
        g.set_xlabels("Contexts", fontsize=30)
        g.despine(left=True)
        plt.legend(loc='upper left', prop={'size': 20})
        g.savefig("../data/plots/finalBar_{}_{}_plots.png".format(dataset, k),
                                                                  bbox_inches='tight')


In [None]:
def profileDist(datasetName, context, function):
    """Plot all z-normalized load profiles of a processed dataset, labelled by building type.

    Parameters
    ----------
    datasetName : str
        'BDG' reads the labels from ``meta_open.csv`` ('primaryspaceusage');
        any other value reads them from the DGS metadata files ('espm_type_name').
    context : str
        Context part of the processed dataset file name.
    function : str
        Aggregation-function part of the processed dataset file name; also
        shown in the plot title.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # load data
    df_data = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function), index_col=0)

    # load metadata
    if datasetName == 'BDG':
        df_meta = pd.read_csv('../data/raw/meta_open.csv')
        # get labels for all buildings
        ground_truth_labels = df_meta[df_meta.iloc[:, 0].isin(df_data.index.values)]
        ground_truth_labels = ground_truth_labels['primaryspaceusage']

    else:
        df_meta = pd.read_csv('../data/raw/dgs_metadata.csv')
        df_aux = pd.read_csv("../data/raw/DGS_322_Buildings-15m-By_Building-DST-gap-filled-3-2-18-508pm.csv")

        # get labels for all buildings
        df_aux = df_aux.T
        df_hist = df_aux[df_aux.iloc[:, 0].isin(df_data.index.values)] # get id based on names
        df_hist = df_meta[df_meta['id'].isin(df_hist.index.values)] # get label based on id
        ground_truth_labels = df_hist['espm_type_name']

    # NOTE(review): the labels keep the metadata row order and are attached to the
    # profiles by position below -- confirm the metadata order matches df_data.
    ground_truth_labels = ground_truth_labels.reset_index(drop=True)

    # Z-normalize the data
    df_scaled = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df_data.values)
    # df.values will generate a 3d array (the third dimension being 1) so we convert it to a 2d array
    df_scaled = np.squeeze(df_scaled) # now is a 2d array

    # back to pandas
    df_scaled = pd.DataFrame(df_scaled)
    df_scaled['psu'] = ground_truth_labels

    plt.figure(figsize=(18,10))

    for _, building in df_scaled.iterrows():
        # 'psu' is the last column: everything before it is the profile itself.
        # Explicit positional access (.iloc) avoids the deprecated positional
        # fallback of Series[int] on a label-based index.
        plt.plot(building.iloc[:-1], "-", alpha=0.4, label=building.iloc[-1])

    # take care of repeating label and group them
    # NOTE(review): each line gets its own colour from the default cycle, so the
    # legend shows one handle per label rather than a colour shared by that label.
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='center left', bbox_to_anchor=(1, 0.55), prop={'size': 20})

    # axis and title
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.title("Load Profiles curves based on {}".format(function), fontsize = 30)

    plt.show()
  