# This notebook is the single entry point for every experiment. All function calls to the preprocessing and clustering notebooks are made from here.

In [1]:
# Libraries needed
# !pip install nbimporter # uncomment if the library is not installed

In [2]:
# Existing Notebooks
import nbimporter
import os
import sys

# Put the parent directory on the import path so the notebook packages
# imported below (Preprocessing, ClusteringAnalysis) can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from Preprocessing.preprocessing import hourly_dataset
from Preprocessing.context_extraction import getContext
from Preprocessing.load_cuve_generation import doAggregation

from ClusteringAnalysis.ClusteringValidationMetrics import get_validation_scores, combineMetrics
from ClusteringAnalysis.Kshape import doKshape
# %run ./ClusteringAnalysis/ClusteringValidationMetrics
# !jupyter nbconvert --execute ../ClusteringAnalysis/ClusteringValidationMetrics

# Built-in libraries
import time
from itertools import product
from math import log
import pickle

# NumPy, SciPy and Pandas
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

# Scikit-Learn
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import pairwise_distances

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm


Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/preprocessing.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/context_extraction.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/Preprocessing/load_cuve_generation.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/ClusteringValidationMetrics.ipynb
Importing Jupyter notebook from /Users/matias/Documents/Education/Graduate/NUS/Projects/sensor-cluster-er/ClusteringAnalysis/Kshape.ipynb


In [3]:
# Loading Data

In [4]:
# Running Experiment 
def _load_or_build(csv_path, builder, found_msg, build_msg):
    """Return the dataset cached at ``csv_path`` or produce it with ``builder``.

    Parameters
    ----------
    csv_path : str
        Location of the cached CSV file.
    builder : callable
        Zero-argument callable invoked when ``csv_path`` does not exist; its
        return value is handed back unchanged.
    found_msg, build_msg : str
        Progress messages printed before loading / building respectively.
    """
    if os.path.isfile(csv_path):
        print(found_msg)
        return pd.read_csv(csv_path, index_col=0)
    print(build_msg)
    return builder()


def runExperiment(datasetName, context, function, algorithm='kshape',
                  algo_parameter=range(2,11), validation_metrics='all'):
    """Run one clustering experiment and save its validation scores.

    The dataset goes through three cached stages (hourly preprocessing,
    context extraction, load-curve aggregation). Each stage is read from
    ``../data/processed/`` when its CSV exists and produced by the matching
    preprocessing function otherwise. The selected algorithm is then run once
    per parameter value and the validation scores of every run are collected
    in a DataFrame that is saved both as pickle and as CSV under
    ``../data/results/``.

    Parameters
    ----------
    datasetName : str
        Dataset identifier; used in file names and passed to the
        preprocessing and clustering functions.
    context : str
        Context identifier passed to ``getContext`` and used in file names.
    function : str
        Load-curve aggregation function passed to ``doAggregation`` and used
        in file names.
    algorithm : str, optional
        Clustering algorithm to run. Only ``'kshape'`` is implemented.
    algo_parameter : iterable, optional
        Values of ``k`` to evaluate. Materialised once, so one-shot iterables
        are accepted.
    validation_metrics : str, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``algorithm`` is not supported. Raised before any preprocessing so
        that no meaningless (empty) results file is written.
    """
    # Fail fast: previously an unknown algorithm silently produced an empty
    # score table that was still saved to disk.
    if algorithm != 'kshape':
        raise ValueError(
            "Unsupported algorithm '{}'; only 'kshape' is implemented".format(algorithm))

    # The parameter values are needed twice (loop + result column), so they
    # are materialised in case a generator was passed.
    parameters = list(algo_parameter)

    # Stage 1: hourly dataset.
    # NOTE(review): the results of stages 1 and 2 are overwritten below;
    # presumably the builders persist their output for the next stage - verify.
    df = _load_or_build(
        '../data/processed/{}_dataset.csv'.format(datasetName),
        lambda: hourly_dataset(datasetName),
        "Preprocessed dataset already exists, loading it ...",
        "Preprocessing dataset ...")

    # Stage 2: dataset restricted to the requested context.
    df = _load_or_build(
        '../data/processed/{}_{}_dataset.csv'.format(datasetName, context),
        lambda: getContext(datasetName, context),
        "Dataset with {} context already already exists, loading it ...".format(context),
        "Generating context dataset ...")

    # Stage 3: load curves aggregated with the requested function.
    df = _load_or_build(
        '../data/processed/{}_{}_{}_dataset.csv'.format(datasetName, context, function),
        lambda: doAggregation(datasetName, context, function),
        "Dataset with {} context and {} load curve aggregation function already already exists, loading it ...".format(context, function),
        "Generating load curves based on {} ...".format(function))

    # One entry of validation scores per evaluated value of k.
    collected = []
    for k in parameters:
        print("Running experiment with {} and k = {}".format(algorithm, k))
        _, labels = doKshape(df, datasetName, k, seed=3, max_iter=200)
        collected.append(get_validation_scores(df.values, labels))

    # Prefix the score table with the columns that identify the experiment.
    scores = pd.DataFrame(collected)
    scores.insert(0, 'dataset', datasetName)
    scores.insert(1, 'context', context)
    scores.insert(2, 'function', function)
    scores.insert(3, 'algorithm', algorithm)
    scores.insert(4, 'parameter k', parameters)

    # Name (without extension) shared by both result files.
    obj_name = '../data/results/{}_{}_{}_{}_scores'.format(datasetName, context, function, algorithm)
    # Make sure the target directory exists so a long run is not lost at save time.
    os.makedirs(os.path.dirname(obj_name), exist_ok=True)

    # Save as python pickle; the context manager guarantees the file is
    # closed (the original `f.close` was never actually called).
    with open(obj_name + '.pkl', 'wb') as f:
        pickle.dump(scores, f)

    # Save as csv
    scores.to_csv('{}.csv'.format(obj_name))
    print('Scores saved in {}.csv'.format(obj_name))

    return


In [5]:
# Visualizing Scores

In [6]:
# # plotting curves based on metrics
# i = 10

# filename = ["data/BDG_avg_weekday_kshape_scores.pkl", "data/BDG_median_weekday_kshape_scores.pkl", "data/BDG_regression_weekday_kshape_scores.pkl",
#             "data/BDG_avg_weekend_kshape_scores.pkl", "data/BDG_median_weekend_kshape_scores.pkl", "data/BDG_regression_weekend_kshape_scores.pkl",
#             "data/DC_avg_weekday_kshape_scores.pkl", "data/DC_median_weekday_kshape_scores.pkl", "data/DC_regression_weekday_kshape_scores.pkl",
#             "data/DC_avg_weekend_kshape_scores.pkl", "data/DC_median_weekend_kshape_scores.pkl", "data/DC_regression_weekend_kshape_scores.pkl"]

# # read pickle files for each dataset
# rangeK = range(2, 11)
# pickle_in = open(filename[i], "rb")
# df_scores = pickle.load(pickle_in)
# pickle_in.close()

# # print(df_scores[0])
# # combine all metrics for each K value (of current dataset)
# k_metrics = combineMetrics(df_scores[0], rangeK)

# metric_index = 0
# f, axarr = plt.subplots(len(k_metrics), sharex=False, figsize =(10,30))
# # iterate through every metric and plot the value versus the corresponding k value
# for metric in k_metrics.keys():
#     axarr[metric_index].plot(rangeK, k_metrics.get(metric), "k-")
#     axarr[metric_index].set_title("{} curve over K values".format(metric), fontsize = 18)

#     metric_index += 1
# plt.show()
# f.savefig("img/{}_kshape_plot.png".format(dataframes_names[i]), bbox_inches='tight')
