# This notebook is the single entry point for running any experiment. All calls to the preprocessing and clustering notebooks are made from here.

In [1]:
# Libraries needed
# !pip install nbimporter # uncomment if the library is not installed

In [2]:
# Existing Notebooks
import nbimporter
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from Preprocessing.preprocessing import hourly_dataset
from Preprocessing.context_extraction import getContext
from Preprocessing.load_cuve_generation import doAggregation

from ClusteringAnalysis.ClusteringValidationMetrics import get_validation_scores, combineMetrics
from ClusteringAnalysis.Kshape import doKshape

# Built-in libraries
import time
from itertools import product
from math import log
import pickle

# NumPy, SciPy and Pandas
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

# Scikit-Learn
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import pairwise_distances

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm



Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/preprocessing.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/context_extraction.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/load_cuve_generation.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/ClusteringValidationMetrics.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/Kshape.ipynb


In [8]:
# check files
def checkFiles(datasetName, context, function):
    """Return the aggregated load-curve dataset, producing missing intermediate files first.

    Parameters
    ----------
    datasetName : str
        Dataset identifier used in the CSV file names. The special value
        'BDG-DGS' is built by stacking the already-aggregated 'BDG' and 'DGS'
        files for the same ``context`` and ``function``.
    context : str
        Context label used in the CSV file names and passed to ``getContext``.
    function : str
        Aggregation function label used in the CSV file names and passed to
        ``doAggregation``.

    Returns
    -------
    pandas.DataFrame
        The frame read from (or generated for)
        ``../data/processed/{datasetName}_{context}_{function}_dataset.csv``.

    Notes
    -----
    All paths are relative to the current working directory. For 'BDG-DGS'
    the combined frame is also written back to disk. For every other dataset
    the three stages (hourly dataset, context, aggregation) are checked in
    order; only the frame from the last stage is returned.
    """
    # if the dataset is the combination, directly load the aggregated datasets and stack them
    if datasetName == 'BDG-DGS':
        df1_name = 'BDG'
        df2_name = 'DGS'

        df1 = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(df1_name, context, function), index_col=0)
        df2 = pd.read_csv('../data/processed/{}_{}_{}_dataset.csv'.format(df2_name, context, function), index_col=0)

        # DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way
        df = pd.concat([df1, df2])
        df.to_csv('../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function))
        return df

    # NOTE(review): the frames obtained in the first two stages are overwritten below;
    # the helpers are presumably called so that they create the missing files - confirm.

    # stage 1: hourly dataset
    hourly_path = '../data/processed/{}_dataset.csv'.format(datasetName)
    if os.path.isfile(hourly_path):  # if file exists, read it
        df = pd.read_csv(hourly_path, index_col=0)
        print("Preprocessed dataset already exists, loading it ...")
    else:  # if file is missing, produce it (announce before the long-running step)
        print("Preprocessing dataset ...")
        df = hourly_dataset(datasetName)

    # stage 2: dataset restricted to the requested context
    context_path = '../data/processed/{}_{}_dataset.csv'.format(datasetName, context)
    if os.path.isfile(context_path):  # if file exists, read it
        df = pd.read_csv(context_path, index_col=0)
        print("Dataset with {} context already exists, loading it ...".format(context))
    else:  # if file is missing, produce it
        print("Generating context dataset ...")
        df = getContext(datasetName, context)

    # stage 3: load curves aggregated with the requested function
    function_path = '../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function)
    if os.path.isfile(function_path):  # if file exists, read it
        df = pd.read_csv(function_path, index_col=0)
        print("Dataset with {} context and {} load curve aggregation function already exists, loading it ...".format(context, function))
    else:  # if file is missing, produce it
        print("Generating load curves based on {} ...".format(function))
        df = doAggregation(datasetName, context, function)

    return df

In [10]:
# Running Experiment 
def runExperiment(datasetName, context, function, algorithm='kshape', 
                  algo_parameter=range(2,11), validation_metrics='all', appendTotalFile=False):
    """Run a clustering algorithm over a set of parameter values and persist the validation scores.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the dataset to load through ``checkFiles`` and are stored as
        metadata columns in the resulting score table.
    algorithm : str, default 'kshape'
        Clustering algorithm to run. Only 'kshape' is executed; any other value
        yields a score table with no metric rows.
    algo_parameter : iterable of int, default range(2, 11)
        Values of k to evaluate, one clustering run per value.
    validation_metrics : str, default 'all'
        Currently unused by this function.
    appendTotalFile : bool, default False
        When True, the scores are also appended (without header) to
        ``../data/results/total_scores.csv``.

    Returns
    -------
    None
        Results are written to
        ``../data/results/{datasetName}_{context}_{function}_{algorithm}_scores``
        as both ``.pkl`` and ``.csv``.
    """
    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # materialise once so a one-shot iterable can feed both the loop and the score table
    parameter_values = list(algo_parameter)

    scores = [] # list of the scores for each parameter for the selected algorithm

    # run selected algorithm with the appropriate parameter
    if algorithm == 'kshape':
        # run kshape for each value of K
        for k in parameter_values:
            print("Running experiment with {} and k = {}".format(algorithm, k))
            model, labels = doKshape(df, datasetName, k, seed=3, max_iter=200)
            scores.append(get_validation_scores(df.values, labels))

    # name for saving the results
    obj_name = '../data/results/{}_{}_{}_{}_scores'.format(datasetName, context, function, algorithm)

    # build the final score dataframe: metadata columns first, then the metrics
    scores = pd.DataFrame.from_dict(scores)
    scores.insert(0, 'dataset', datasetName)
    scores.insert(1, 'context', context)
    scores.insert(2, 'function', function)
    scores.insert(3, 'algorithm', algorithm)
    if "k" in algorithm:
        scores.insert(4, 'parameter k', parameter_values)

    # save as python pickle; the context manager guarantees the handle is flushed and closed
    with open(obj_name + '.pkl', 'wb') as f:
        pickle.dump(scores, f)

    # save as csv
    scores.to_csv('{}.csv'.format(obj_name))
    print('Scores saved in {}.csv'.format(obj_name)) # individual file

    if appendTotalFile:
        with open('../data/results/total_scores.csv', 'a') as f: # append to general file
            scores.to_csv(f, header=False)

    return


In [5]:
# Generating clusters and centroids
def generateClusters(datasetName, context, function, algorithm='kshape', algo_parameter = 5):
    """Run the clustering algorithm with plotting enabled for one or several parameter values.

    Parameters
    ----------
    datasetName, context, function : str
        Identify the dataset to load through ``checkFiles``.
    algorithm : str, default 'kshape'
        Clustering algorithm to run. Only 'kshape' is executed.
    algo_parameter : int or iterable of int, default 5
        Value(s) of k. A single number is treated as a one-element list.

    Returns
    -------
    None
        ``doKshape`` is called with ``plot=True``; its return values are not used here.
    """
    # check if file exists and load it
    df = checkFiles(datasetName, context, function)

    # accept both a single k (such as the default) and a collection of k's
    try:
        k_values = list(algo_parameter)
    except TypeError:
        k_values = [algo_parameter]

    # run algorithm
    if algorithm == 'kshape':
        for k in k_values:
            print("Running experiment with {} and k = {}".format(algorithm, k))
            doKshape(df, datasetName, k, seed=3, max_iter=200, plot=True)

    return

In [6]:
# Visualizing Scores
def generateMetricPlots(datasetName, context, function, algorithm='kshape', showPlots=False):
    """Plot every stored validation metric against the algorithm parameter and save the figure.

    Reads ``../data/results/{datasetName}_{context}_{function}_{algorithm}_scores.pkl``
    and writes ``../data/plots/{datasetName}_{context}_{function}_{algorithm}_plots.png``
    with one subplot per metric column.

    Parameters
    ----------
    datasetName, context, function : str
        Used to build the input and output file names.
    algorithm : str, default 'kshape'
        Algorithm whose scores are plotted. Only 'kshape' is supported.
    showPlots : bool, default False
        When True, the figure is also displayed with ``plt.show()``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``algorithm`` is not 'kshape', or the score table has no metric columns.
    """
    if algorithm != 'kshape':
        # the parameter column is only known for kshape
        raise ValueError("generateMetricPlots only supports algorithm='kshape', got {!r}".format(algorithm))

    plt.ioff() # this way only plt.show() will display figures

    scores_path = "../data/results/{}_{}_{}_{}_scores.pkl".format(datasetName, context, function, algorithm)
    # NOTE: pickle.load can execute arbitrary code; only load score files produced by this project
    with open(scores_path, "rb") as pickle_in:
        df_scores = pickle.load(pickle_in)

    # the x-axis is the different parameter values of the algorithm;
    # for kshape the column of index 4 is where the value of k is stored
    x_axis = list(df_scores.iloc[:, 4])

    # dataframe of only the validation metrics (everything after the metadata columns)
    df_metrics = df_scores.iloc[:, 5:]
    num_metrics = len(df_metrics.columns)
    if num_metrics == 0:
        raise ValueError("No validation metric columns found in {}".format(scores_path))
    # extract column names for plotting
    metric_names = df_metrics.columns.values

    # squeeze=False keeps a 2-D array of axes, so a single metric can be indexed like several
    f, axarr = plt.subplots(num_metrics, sharex=False, figsize =(10,30), squeeze=False)
    # iterate through every metric and plot the value versus the corresponding algo parameter
    for metric_index in range(num_metrics):
        ax = axarr[metric_index, 0]
        ax.plot(x_axis, df_metrics.iloc[:, metric_index], "k-")
        ax.set_title("{} curve over K values".format(metric_names[metric_index]), fontsize = 18)

    # save before showing so the file does not depend on how the backend handles show()
    plot_path = "../data/plots/{}_{}_{}_{}_plots.png".format(datasetName, context, function, algorithm)
    f.savefig(plot_path, bbox_inches='tight')
    print("Plots saved in {}".format(plot_path))

    # if boolean parameter for plotting is True, show the figure
    if showPlots:
        plt.show()

    # release the figure so repeated calls do not accumulate open figures
    plt.close(f)

    return
