# This notebook is the single entry point for every experiment. All function calls into the preprocessing and clustering notebooks are made from here.

In [1]:
# Libraries needed
# !pip install nbimporter # uncomment if the library is not installed

In [1]:
# Existing Notebooks
import nbimporter
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from Preprocessing.preprocessing import hourly_dataset
from Preprocessing.context_extraction import getContext
from Preprocessing.load_cuve_generation import doAggregation

from ClusteringAnalysis.ClusteringValidationMetrics import get_validation_scores
from ClusteringAnalysis.ClusteringAlgorithms import doClustering

# Built-in libraries
import time
from itertools import product
from math import log
import pickle
from datetime import datetime, timedelta # testing

# NumPy, SciPy and Pandas
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

# Scikit-Learn
from sklearn.metrics import silhouette_samples # , silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from tslearn.clustering import silhouette_score
# Tslearn
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.gridspec as gridspec


Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/preprocessing.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/context_extraction.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/load_cuve_generation.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/ClusteringValidationMetrics.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/ClusteringAlgorithms.ipynb


In [8]:
# check files
def _load_or_build(path, build, found_message, build_message):
    """Read the CSV at ``path`` when it exists, otherwise call ``build()`` and return its result."""
    if os.path.isfile(path):
        print(found_message)
        return pd.read_csv(path, index_col=0)
    print(build_message)
    return build()


def checkFiles(datasetName, context, function):
    """Return the load-curve dataset for one (dataset, context, function) combination.

    Parameters
    ----------
    datasetName : str
        Dataset identifier. The special value ``'BDG-DGS'`` stacks the already
        aggregated ``BDG`` and ``DGS`` files and writes the combined CSV.
    context : str
        Context name used in the processed file names.
    function : str
        Load-curve aggregation function name used in the processed file names.

    Returns
    -------
    pandas.DataFrame
        Content of ``../data/processed/{datasetName}_{context}_{function}_dataset.csv``,
        or the result of ``doAggregation`` when that file does not exist yet.

    Notes
    -----
    For any other dataset the three stages (hourly dataset, context dataset,
    load curves) are visited in order, and each one is either read from disk or
    produced by its preprocessing function. Only the last stage is returned.
    The earlier ones are still visited so that missing intermediate results get
    generated (presumably each helper persists its own CSV -- confirm in the
    Preprocessing notebooks).
    """
    aggregated_path = '../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function)

    # the combined dataset is built directly from the two aggregated datasets
    if datasetName == 'BDG-DGS':
        parts = [
            pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(name, context, function), index_col=0)
            for name in ('BDG', 'DGS')
        ]
        # DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way
        df = pd.concat(parts)
        df.to_csv(aggregated_path)
        return df

    # stage 1: hourly preprocessed dataset
    df = _load_or_build(
        '../data/processed/{}_dataset.csv'.format(datasetName),
        lambda: hourly_dataset(datasetName),
        "Preprocessed dataset already exists, loading it ...",
        "Preprocessing dataset ...",
    )

    # stage 2: dataset restricted to the requested context
    df = _load_or_build(
        '../data/processed/{}_{}_dataset.csv'.format(datasetName, context),
        lambda: getContext(datasetName, context),
        "Dataset with {} context already exists, loading it ...".format(context),
        "Generating context dataset ...",
    )

    # stage 3: load curves aggregated with the requested function
    df = _load_or_build(
        aggregated_path,
        lambda: doAggregation(datasetName, context, function),
        "Dataset with {} context and {} load curve aggregation function already exists, loading it ...".format(context, function),
        "Generating load curves based on {} ...".format(function),
    )

    return df

In [15]:
# Running Experiment 
def runExperiment(datasetName, context, function, algorithm='kshape', 
                  algo_parameter=range(2,11), validation_metrics='all', appendTotalFile=False):
    """Cluster one dataset for every parameter value and store the validation scores.

    Parameters
    ----------
    datasetName, context, function : str
        Select the dataset through ``checkFiles``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    algo_parameter : iterable
        Parameter values to try (forwarded as ``k``). Any iterable is accepted.
    validation_metrics : str
        Currently not used by this function; kept for interface compatibility.
    appendTotalFile : bool
        When true, the scores are also appended (without header) to
        ``../data/results/total_scores.csv``.

    Returns
    -------
    None
        Results are written to ``../data/results/`` as ``.pkl`` and ``.csv``.
    """
    print("Running Experiment with dataset: {}, context:  {}, function: {}, algorithm: {}".format(datasetName, 
                                                                                                context,
                                                                                                function,
                                                                                                algorithm))
    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # Z-normalize the data; the scaler output is 3d (third dimension of size 1),
    # so squeeze it back to a 2d array
    df_scaled = np.squeeze(TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values))

    # materialise once: the values are needed for the loop AND for the
    # 'parameter k' column, which would be empty for a one-shot iterator
    k_values = list(algo_parameter)

    scores = [] # one entry of validation scores per parameter value
    for k in k_values:
        print("Running experiment with {} and k = {}".format(algorithm, k))
        model, labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=200)
        scores.append(get_validation_scores(df_scaled, labels))

    # name for saving the results
    obj_name = '../data/results/{}_{}_{}_{}_scores'.format(datasetName, context, function, algorithm)

    # prepend the experiment description to the score columns
    scores = pd.DataFrame.from_dict(scores)
    scores.insert(0, 'dataset', datasetName)
    scores.insert(1, 'context', context)
    scores.insert(2, 'function', function)
    scores.insert(3, 'algorithm', algorithm)
    if "k" in algorithm or algorithm == 'hierarchical':
        scores.insert(4, 'parameter k', k_values)

    # approximate to two decimals
    scores = scores.round(2)

    # save as python pickle; the context manager guarantees the file is closed
    # (the previous `f.close` without parentheses never closed it)
    with open(obj_name + '.pkl', 'wb') as f:
        pickle.dump(scores, f)

    # save as csv
    scores.to_csv('{}.csv'.format(obj_name))
    print("Scores saved in {}.csv\n".format(obj_name)) # individual file

    if appendTotalFile:
        with open('../data/results/total_scores.csv', 'a') as f: # append to general file
            scores.to_csv(f, header=False)

    return


In [None]:
# Run batch of experiments
def runBatchExperiments(datasetName_list, context_list, function_list, algorithm_list, 
                  algo_parameter=range(2,11), validation_metrics='all'):
    """Run ``runExperiment`` for every combination of the given lists.

    Combinations are visited with the dataset varying slowest and the
    algorithm fastest. Every run appends its scores to the total score file
    (``appendTotalFile`` is passed as ``True``).
    """
    combinations = product(datasetName_list, context_list, function_list, algorithm_list)
    for datasetName, context, function, algorithm in combinations:
        runExperiment(datasetName, context, function, algorithm, algo_parameter, validation_metrics, True)
                        

In [5]:
# Generating clusters and centroids
def generateClusters(datasetName, context, function, algorithm='kshape', algo_parameter = 5):
    """Cluster a dataset and plot the resulting clusters for each parameter value.

    Parameters
    ----------
    datasetName, context, function : str
        Select the dataset through ``checkFiles``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    algo_parameter : int or iterable of int
        A single parameter value or several of them. The default is a plain
        integer, which previously failed because the body iterated over it.

    Returns
    -------
    None
        ``doClustering`` is called with ``plot=True`` and the figures are shown.
    """
    # accept both a single value (the default) and a collection of values
    if isinstance(algo_parameter, (int, np.integer)):
        k_values = [algo_parameter]
    else:
        k_values = list(algo_parameter)

    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # Z-normalize the data; the scaler output is 3d (third dimension of size 1),
    # so squeeze it back to a 2d array
    df_scaled = np.squeeze(TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values))

    # run selected algorithm with each requested parameter
    for k in k_values:
        model, labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300, plot=True)
    plt.show()
    return

In [6]:
# TODO
# Visualizing Scores
def generateMetricPlots(datasetName, context, function, algorithm='kshape', showPlots=False):
    """Plot every validation metric of a stored experiment against the algorithm parameter.

    Reads ``../data/results/{dataset}_{context}_{function}_{algorithm}_scores.pkl``,
    draws one subplot per metric column and saves the figure to
    ``../data/plots/{dataset}_{context}_{function}_{algorithm}_plots.png``.

    Parameters
    ----------
    datasetName, context, function, algorithm : str
        Identify the stored score file and the output image.
    showPlots : bool
        When true the figure is also displayed before being saved.

    Returns
    -------
    None
    """
    plt.ioff() # this way only plt.show() will display figures

    scores_path = "../data/results/{}_{}_{}_{}_scores.pkl".format(datasetName, context, function, algorithm)
    # NOTE: pickle.load executes arbitrary code; only use it on result files
    # produced by this project.
    with open(scores_path, "rb") as pickle_in:
        df_scores = pickle.load(pickle_in)

    # the x-axis holds the parameter values; they are read from the column at
    # position 4 (assumes the score file contains a parameter column there --
    # TODO confirm for algorithms that do not store one)
    x_axis = list(df_scores.iloc[:, 4])

    # every column after the parameter column is treated as a validation metric
    df_metrics = df_scores.iloc[:, 5:]
    num_metrics = len(df_metrics.columns)
    metric_names = df_metrics.columns.values

    # squeeze=False keeps the axes in a 2d array even for a single metric,
    # where plt.subplots would otherwise return a bare Axes object
    f, axarr = plt.subplots(num_metrics, sharex=False, figsize =(10,30), squeeze=False)
    for position, ax in enumerate(axarr[:, 0]):
        ax.plot(x_axis, df_metrics.iloc[:, position], "k-")
        ax.set_title("{} curve over K values".format(metric_names[position]), fontsize = 18)

    # if boolean parameter for plotting is True, show the figure
    if showPlots:
        plt.show()

    f.savefig("../data/plots/{}_{}_{}_{}_plots.png".format(datasetName, context, function, algorithm), 
                                                              bbox_inches='tight')
    print("Plots saved in ../data/plots/{}_{}_{}_{}_plots.png".format(datasetName, context, function, algorithm))

    # release the figure so repeated calls do not accumulate open figures
    plt.close(f)

    return


In [None]:
def plotSilhouette(datasetName, context, function, algorithm, k):
    """Cluster a stored load-curve dataset and display its silhouette plot.

    Parameters
    ----------
    datasetName, context, function : str
        Identify ``../data/processed/{dataset}_{context}_{function}_dataset.csv``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    k : int
        Number of clusters; clusters are expected to be labelled ``0 .. k-1``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    plt.ioff() # this way only plt.show() will display figures

    # set tick label sizes BEFORE the axes are created so they apply to this
    # figure as well (they were previously set after drawing)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)

    df = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function), index_col=0)

    # a single axes hosts the whole silhouette plot
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    ax1.set_xlim([-0.1, 1])
    # The (k+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(df.values) + (k + 1) * 10])

    # Z-normalize; the scaler output is 3d, squeeze it back to 2d
    df_scaled = np.squeeze(TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df.values))

    clusterer, cluster_labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300)
    # guarantee that `cluster_labels == i` below is an element-wise mask
    cluster_labels = np.asarray(cluster_labels)

    # Average silhouette over all samples: a summary of the density and
    # separation of the formed clusters.
    # NOTE(review): the average comes from tslearn's silhouette_score while the
    # per-sample values come from scikit-learn's silhouette_samples -- confirm
    # both use the same distance metric.
    silhouette_avg = round(silhouette_score(df_scaled, cluster_labels), 2) # round to two decimals
    print("For k =", k, "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(df_scaled, cluster_labels)

    y_lower = 10
    for i in range(k):
        # silhouette values of the members of cluster i, in ascending order
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize = 30)

        # leave a gap of 10 rows before the next cluster
        y_lower = y_upper + 10

    ax1.set_title("Silhouette analysis for {} {} using {}".format(context, function, algorithm), fontsize = 30)
    ax1.set_xlabel("Silhouette coefficient values", fontsize = 30)
    ax1.set_ylabel("Cluster label", fontsize = 30)

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    # matplotlib expands the view limits to include every tick, so the
    # x-range effectively starts at -0.3 after this call
    ax1.set_xticks([-0.3,-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.show()

    return

In [None]:
def plotCurvesOneBuilding(datasetName, context, function, resolution='day'):
    """Plot, for every column of a context dataset, its daily profiles and their aggregate curve.

    For each column two figures are produced: all daily profiles overlaid in
    translucent black, and the representative curve (mean or median over the
    days) in red.

    Parameters
    ----------
    datasetName, context : str
        Identify ``../data/processed/{dataset}_{context}_dataset.csv``.
    function : {'average', 'median'}
        Aggregation applied across the daily profiles.
    resolution : {'day'}
        Length of one profile; only ``'day'`` is supported.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``resolution`` or ``function`` is not supported. Arguments are
        checked before any file is read; the previous ``exit()`` calls would
        have terminated the notebook kernel instead.
    """
    # validate the arguments up front, before any I/O or plotting happens
    if resolution == 'day':
        delta = 23 # hours added to the day start; with inclusive bounds this spans 24 hourly readings
    else:
        raise ValueError("Please choose a valid resolution")

    if function == 'average':
        aggregate = np.mean
    elif function == 'median':
        aggregate = np.median
    else:
        raise ValueError("Please choose a valid function")

    # `infer_datetime_format` is deprecated in pandas 2.x and no longer needed
    dataframe = pd.read_csv('../data/processed/{}_{}_dataset.csv'.format(datasetName, context), parse_dates=True, 
                            index_col=0)

    # one timestamp per calendar day covered by the data
    availableSamples = (dataframe.resample('1D').asfreq()).index

    # iterate through all buildings (column)
    for column in range(len(dataframe.columns)):
        currentColumn = dataframe.iloc[:, [column]] # one-column dataframe

        # collect one positionally indexed slice per day and concatenate once
        daily_readings = []
        for timestamp in availableSamples:
            start = timestamp
            end = timestamp + timedelta(hours=delta)
            in_window = (currentColumn.index >= start) & (currentColumn.index <= end)
            # drop the timestamps so that all days line up by position
            daily_readings.append(currentColumn[in_window].reset_index(drop=True))

        if daily_readings:
            df_sampledReadings = pd.concat(daily_readings, axis=1)
        else:
            df_sampledReadings = pd.DataFrame()

        # drop days without any reading, put one day per row and
        # replace the remaining gaps by 0
        df_sampledReadings = df_sampledReadings.dropna(axis=1, how='all').T.fillna(value=0)

        # representative curve across all days
        load_curve = aggregate(df_sampledReadings, axis = 0)

        # figure 1: every daily profile
        plt.figure(figsize=(18,10))
        plt.ylim(0,7)
        for _, curve in df_sampledReadings.iterrows():
            plt.plot(curve, "k-", alpha=.2)
        plt.title("Load Profiles and red representative curve based on {}".format(function))        
        plt.show()

        # figure 2: the representative curve on the same y-range
        plt.figure(figsize=(18,10))
        plt.plot(load_curve, "r-", linewidth=7.0)
        plt.ylim(0,7)
        plt.show()

    return

In [None]:
def plotSC():
    """Draw a 3x3 grid of score curves from the total score file and save it.

    Rows of the grid are the datasets ``BDG``, ``DGS`` and ``BDG-DGS``; columns
    are the algorithms ``kshape``, ``kmeans`` and ``hierarchical``. Inside each
    panel one line is drawn per distinct (context, function) pair found in
    ``../data/results/total_scores.csv``. The figure is shown and then written
    to ``../data/plots/silhouette_all_plots.png``.

    Columns of the score file are addressed by position: 0 dataset, 1 context,
    2 function, 3 algorithm, 4 parameter value. The plotted values are read
    from the column literally named ``' '``.
    NOTE(review): presumably that column holds the silhouette score and the
    file was given a header row by hand -- confirm against the actual file.
    """
    df = pd.read_csv('../data/results/total_scores.csv', index_col=0)

    datasets = ['BDG', 'DGS', 'BDG-DGS']
    algorithms = ['kshape', 'kmeans', 'hierarchical']

    # set tick label sizes before the axes exist so they apply to this figure
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)

    # nine axes, returned as a 3x3 array
    f, axarr = plt.subplots(3, 3, figsize=(20,18))
    f.subplots_adjust(hspace=0.05, wspace=0.05)

    for i, dataset in enumerate(datasets): # rows, different dataset
        df_row = df[df.iloc[:, 0] == dataset]
        for j, algorithm in enumerate(algorithms): # columns, different algorithm
            panel = axarr[i, j]
            df_experiment = df_row[df_row.iloc[:, 3] == algorithm]
            # distinct experiment descriptions; a new frame avoids mutating a slice
            df_experiment_context = df_experiment.iloc[:, 0:4].drop_duplicates()

            # one line for each (context, function) pair
            for _, context in df_experiment_context.iterrows():
                # explicit positional access: plain integer keys on a
                # string-labelled Series are deprecated in recent pandas
                context_value = context.iloc[1]
                function_value = context.iloc[2]
                current_plot = df_experiment[(df_experiment.iloc[:, 1] == context_value) &
                                             (df_experiment.iloc[:, 2] == function_value)]
                panel.set_ylim([-0.05, 0.6])
                x_values = current_plot.iloc[:, 4]
                y_values = current_plot[' ']
                context_name = str(context_value) + " " +  str(function_value)
                panel.plot(x_values, y_values, label=context_name)
                panel.xaxis.grid(True)

    # column titles and row labels
    cols = ['K-Shape', 'K-Means', 'Hierarchical']
    for ax, col in zip(axarr[0], cols):
        ax.set_title(col, fontsize = 30)

    for ax, row in zip(axarr[:,0], datasets):
        ax.set_ylabel(row, rotation=90, fontsize=30)

    f.text(0.5, 0.08, 'K', ha='center', fontsize=30)

    # the legend is attached to the current axes and positioned outside the grid
    plt.legend(loc='center left', bbox_to_anchor=(1, 1.55), prop={'size': 20})

    # hide x tick labels on the two upper rows and y tick labels on the two right columns
    plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False)
    plt.setp([a.get_xticklabels() for a in axarr[1, :]], visible=False)
    plt.setp([a.get_yticklabels() for a in axarr[:, 1]], visible=False)
    plt.setp([a.get_yticklabels() for a in axarr[:, 2]], visible=False)
    plt.show()
    f.savefig("../data/plots/silhouette_all_plots.png")


In [2]:
def plotGroundTruthDist(datasetName, context, function, algorithm, k):
    """Plot how the ``primaryspaceusage`` labels are distributed over the clusters.

    Only the ``BDG`` dataset is supported; for any other dataset the function
    returns immediately without doing anything.

    Parameters
    ----------
    datasetName, context, function : str
        Identify ``../data/processed/{dataset}_{context}_{function}_dataset.csv``.
    algorithm : str
        Algorithm name forwarded to ``doClustering``.
    k : int
        Number of clusters; clusters are expected to be labelled ``0 .. k-1``.

    Returns
    -------
    None
        Prints the count table and draws a stacked horizontal bar chart.
    """
    # metadata is read only for BDG; other datasets are skipped
    if datasetName != 'BDG':
        return
    df_meta = pd.read_csv('../data/raw/meta_open.csv', index_col=0)

    # load data
    df_data = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function), index_col=0)

    # Z-normalize the data; the scaler output is 3d, squeeze it back to 2d
    df_scaled = np.squeeze(TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(df_data.values))

    # run algorithm
    model, labels = doClustering(df_scaled, datasetName, algorithm, k, seed=3, max_iter=300)
    labels = np.asarray(labels) # guarantee an element-wise comparison below

    # count the ground truth labels inside every cluster
    counts_per_cluster = {}
    for key in range(k):
        members = np.where(labels == key)[0]

        print(len(members.tolist()))
        print(key)

        # NOTE(review): rows are matched by position, which assumes the
        # metadata file lists the buildings in the same order as the dataset
        # -- confirm.
        ground_truth_labels = df_meta.iloc[members.tolist()]
        counts_per_cluster[key] = ground_truth_labels['primaryspaceusage'].value_counts()

    # Outer-join the per-cluster counts. Assigning them column by column to an
    # empty frame aligned everything on the categories of cluster 0 and
    # silently dropped categories that only occur in other clusters.
    if counts_per_cluster:
        df_dist = pd.concat(counts_per_cluster, axis=1).fillna(0).astype(int)
    else:
        df_dist = pd.DataFrame()

    df_dist.index.name = 'PSU'

    print(df_dist)

    # set tick label sizes before the axes exist so they apply to this figure
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)

    # plot stacked bar
    ax = df_dist.T.plot.barh(stacked=True, figsize=(20, 18))
    # 'top right' is not a valid legend location; 'upper right' is
    plt.legend(loc='upper right', prop={'size': 30})

    ax.xaxis.grid(True)
    return