In [2]:
# NumPy, SciPy and Pandas
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd

# Tslearn
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset
from tslearn.clustering import KShape

# Github https://github.com/Mic92/kshape
from kshape.core import kshape, zscore

# Scikit-Learn
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

# Matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [1]:
def doClustering(dataframe, dataframe_name, algorithm, k, seed=3, max_iter=200, plot=False):
    """Cluster the rows of ``dataframe`` into ``k`` groups and optionally plot them.

    Parameters
    ----------
    dataframe : 2D array-like
        One series per row (``shape[1]`` is used as the series length when
        plotting). It is handed to the clustering backend unchanged; an
        ``np.asarray`` view is used for row counting and plotting.
    dataframe_name : str
        Only used in the figure super-title.
    algorithm : {'kshape', 'kmeans', 'hierarchical'}
        Clustering backend to use.
    k : int
        Number of clusters.
    seed : int, optional
        Passed as ``random_state`` to ``KMeans`` only. The ``kshape`` and
        ``hierarchical`` branches do not receive it.
    max_iter : int, optional
        Passed to ``KMeans`` only.
    plot : bool, optional
        When True, draw one subplot per cluster with its member series in
        translucent black and, for 'kshape' and 'kmeans', the centroid in red.

    Returns
    -------
    model
        For 'kmeans' / 'hierarchical' the fitted scikit-learn estimator; for
        'kshape' the raw return value of ``kshape(dataframe, k)`` (a list of
        ``(cluster_center, member_indices)`` tuples).
    y_pred : numpy.ndarray
        Cluster label for every row of ``dataframe``.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    supported = ('kshape', 'kmeans', 'hierarchical')
    if algorithm not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # `model` further down.
        raise ValueError(
            "Unknown algorithm {!r}; expected one of {}".format(algorithm, supported)
        )

    # Array view used for row counting and plotting; iterating a pandas
    # DataFrame directly would yield column labels instead of rows.
    data = np.asarray(dataframe)
    centers = None  # stays None for 'hierarchical', which has no centroids

    if algorithm == 'kshape':
        # Implementation from https://github.com/Mic92/kshape: returns a list
        # of tuples (cluster_center, indices of the instances in the cluster).
        model = kshape(dataframe, k)
        centers = [model[yi][0] for yi in range(k)]
        # Every row starts in cluster 0 and is then moved to its real cluster.
        y_pred = np.zeros(data.shape[0], dtype=int)
        for yi in range(k):
            members = np.asarray(model[yi][1], dtype=int)
            y_pred[members] = yi
    elif algorithm == 'kmeans':
        model = KMeans(n_clusters=k, random_state=seed, max_iter=max_iter)
        y_pred = model.fit_predict(dataframe)
        centers = model.cluster_centers_
    else:  # 'hierarchical'
        # Ward linkage only supports Euclidean distance, which is also the
        # default, so the distance keyword (named `affinity` in older
        # scikit-learn, `metric` in newer releases) is left out on purpose.
        model = AgglomerativeClustering(n_clusters=k, linkage='ward')
        y_pred = model.fit_predict(dataframe)

    if plot:  # one subplot per cluster
        sz = data.shape[1]  # length of each time series
        fig = plt.figure(figsize=(20, 10))
        for yi in range(k):
            plt.subplot(k, 1, 1 + yi)

            # every series assigned to this cluster
            for xx in data[y_pred == yi]:
                plt.plot(xx.ravel(), "k-", alpha=.2)

            # centroid (in red) when the algorithm provides one
            if centers is not None:
                plt.plot(np.asarray(centers[yi]).ravel(), "r-", linewidth=3)

            plt.xlim(0, sz)
            plt.ylim(-4, 4)
            plt.title("Cluster %d" % (yi + 1), fontsize = 30)
        fig.suptitle("Dataset: {}".format(dataframe_name), fontsize = 35)

    # return the model (or raw k-Shape result) and the cluster labels
    return model, y_pred