In [None]:
# default_exp core

# core

> Cluster Netdata chart metrics by the shape of their time series using the `Clusterer` class.

In [2]:
# hide
from nbdev.showdoc import *

In [3]:
# export

import numpy as np
import pandas as pd
from tslearn.clustering import TimeSeriesKMeans
from netdata_pandas.data import get_data
from am4894plots.plots import plot_lines_grid

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [4]:
# export

class Clusterer:
    """Fetch metrics with `netdata_pandas.get_data`, preprocess them and cluster them with
    tslearn's `TimeSeriesKMeans`.

    Each column of the fetched frame is treated as one time series. After clustering, a
    per-cluster quality score (mean absolute pairwise correlation of the member metrics),
    a summary table, the cluster centers and a grid plot of the centers can be generated.

    Parameters
    ----------
    hosts, charts : list
        Passed straight through to `netdata_pandas.get_data`.
    after, before : int
        Passed straight through to `netdata_pandas.get_data` as `after=` / `before=`.
    diff : bool
        If True, take first differences of every column before any other step.
    norm : bool
        If True, min-max scale every column to the [0, 1] range.
    smooth_n : int
        Rolling window length used for smoothing; values <= 0 disable smoothing.
    smooth_func : str
        One of 'mean', 'max', 'min', 'sum', 'median'. Any other value falls back to 'mean'.
    n_clusters : int
        Number of clusters requested from `TimeSeriesKMeans`.
    min_n : int
        Clusters with fewer members than this are listed in `clusters_dropped`,
        the others in `clusters_final`.
    random_state : int, optional
        Seed forwarded to `TimeSeriesKMeans` so that a clustering can be reproduced.
        The default (None) keeps the unseeded behaviour.
    """

    # Rolling aggregations accepted for `smooth_func`; anything else falls back to 'mean'.
    _SMOOTH_FUNCS = ('mean', 'max', 'min', 'sum', 'median')

    def __init__(self,
                 hosts: list, charts: list, after: int, before: int, diff: bool = False, norm: bool = True,
                 smooth_n: int = 5, smooth_func: str = 'mean', n_clusters: int = 10, min_n: int = 4,
                 random_state: int = None):
        self.hosts = hosts
        self.charts = charts
        self.after = after
        self.before = before
        self.diff = diff
        self.norm = norm
        self.smooth_n = smooth_n
        self.smooth_func = smooth_func
        self.n_clusters = n_clusters
        self.min_n = min_n
        self.random_state = random_state
        self.cluster_quality_dict = {}

    def get_data(self):
        """Fetch the raw data into `self.df` (no credentials are sent: user=None, pwd=None)."""
        self.df = get_data(self.hosts, self.charts, after=self.after, before=self.before, user=None, pwd=None)

    def preprocess_data(self):
        """Apply differencing, smoothing and normalisation to `self.df`, in that order.

        `self.df` is overwritten. Columns that end up entirely NaN are dropped, which
        includes columns with no variation when `norm` is on (their min-max range is zero).
        Finally the index is converted to datetimes, reading the existing values as
        unix seconds.
        """
        if self.diff:
            # The first row of a diff is always NaN. Drop it here so it cannot reach the
            # model when smoothing is disabled; with smoothing on, the rolling window
            # discards it anyway, so the result is unchanged in that case.
            self.df = self.df.diff().iloc[1:]
        if self.smooth_n > 0:
            func_name = self.smooth_func if self.smooth_func in self._SMOOTH_FUNCS else 'mean'
            rolled = getattr(self.df.rolling(self.smooth_n), func_name)()
            # the first smooth_n - 1 rows have incomplete windows and are NaN in every column
            self.df = rolled.dropna(how='all')
        if self.norm:
            col_min = self.df.min()
            col_max = self.df.max()
            self.df = (self.df - col_min) / (col_max - col_min)
        self.df = self.df.dropna(axis=1, how='all')
        self.df = self.df.set_index(pd.to_datetime(self.df.index, unit='s'))

    def cluster_data(self):
        """Fit `TimeSeriesKMeans` on the columns of `self.df` and record the assignments.

        Sets `model`, `df_cluster` (metric -> cluster), `cluster_metrics_dict`
        (cluster -> list of metrics), `cluster_len_dict` (cluster -> member count) and
        the `clusters_all` / `clusters_dropped` / `clusters_final` lists.
        """
        self.model = TimeSeriesKMeans(
            n_clusters=self.n_clusters, metric="euclidean", max_iter=10, n_init=2,
            random_state=self.random_state
        ).fit(self.df.transpose().values)  # transpose: one row per metric
        self.df_cluster = pd.DataFrame(list(zip(self.df.columns, self.model.labels_)), columns=['metric', 'cluster'])
        self.cluster_metrics_dict = self.df_cluster.groupby(['cluster'])['metric'].apply(list).to_dict()
        self.cluster_len_dict = self.df_cluster['cluster'].value_counts().to_dict()
        self.clusters_all = list(self.cluster_len_dict)
        self.clusters_dropped = [c for c, size in self.cluster_len_dict.items() if size < self.min_n]
        self.clusters_final = [c for c, size in self.cluster_len_dict.items() if size >= self.min_n]

    def generate_quality_scores(self):
        """Fill `cluster_quality_dict` with one score per cluster in `clusters_all`.

        The score is the mean absolute pairwise correlation between the cluster's
        metrics, rounded to 2 decimals. A cluster with a single metric has no pairs
        and gets NaN.
        """
        # Rebuild from scratch so a previous clustering cannot leave stale cluster ids behind.
        self.cluster_quality_dict = {}
        for cluster_number in self.clusters_all:
            self.x_corr = self.df[self.cluster_metrics_dict[cluster_number]].corr().abs().values
            # strict upper triangle = each distinct pair once, self-correlations excluded
            pairwise = self.x_corr[np.triu_indices(self.x_corr.shape[0], 1)]
            # no pairs -> NaN directly, avoiding numpy's "mean of empty slice" warning
            self.x_corr_mean = round(pairwise.mean(), 2) if pairwise.size else np.nan
            self.cluster_quality_dict[cluster_number] = self.x_corr_mean

    def generate_df_cluster_meta(self):
        """Build `df_cluster_meta`: one row per cluster with its size `n` and
        `quality_score`, sorted by quality score in descending order."""
        self.df_cluster_meta = pd.DataFrame.from_dict(self.cluster_len_dict, orient='index', columns=['n'])
        self.df_cluster_meta.index.names = ['cluster']
        self.df_cluster_meta['quality_score'] = self.df_cluster_meta.index.map(self.cluster_quality_dict)
        self.df_cluster_meta = self.df_cluster_meta.sort_values('quality_score', ascending=False)

    def generate_df_cluster_centers(self):
        """Build `df_cluster_centers`: one column per cluster center, indexed like `self.df`."""
        centers = self.model.cluster_centers_
        # keep only the first two axes of the centers array, then put one center per column
        self.df_cluster_centers = pd.DataFrame(centers.reshape(centers.shape[0], centers.shape[1])).transpose()
        self.df_cluster_centers.index = self.df.index

    def generate_cluster_centers_plot(self):
        """Store in `fig_centers` a grid plot of the cluster centers, ordered as in
        `df_cluster_meta` and titled with cluster id, size and quality score."""
        meta = self.df_cluster_meta
        titles = [
            f'{cluster} - n={n}, qs={qs}'
            for cluster, qs, n in zip(list(meta.index), list(meta.quality_score), list(meta.n))
        ]
        self.fig_centers = plot_lines_grid(
            self.df_cluster_centers[list(meta.index)], subplot_titles=titles, return_p=True, h_each=75, w=1000,
            legend=False, yaxes_visible=False, xaxes_visible=False, show_p=False
        )

    def run_all(self):
        """Run the whole pipeline: fetch, preprocess, cluster, score, summarise and plot."""
        self.get_data()
        self.preprocess_data()
        self.cluster_data()
        self.generate_quality_scores()
        self.generate_df_cluster_meta()
        self.generate_df_cluster_centers()
        self.generate_cluster_centers_plot()



In [5]:
# hide
# tests

# Smoke test: fetch `system.cpu` from london.my-netdata.io for the window
# after=-60 / before=0 and check the shape of the raw frame.
# NOTE(review): presumably needs network access to the host and assumes it
# exposes 9 dimensions on this chart - confirm.
expected_shape = (60, 9)
model = Clusterer(hosts=['london.my-netdata.io'], charts=['system.cpu'], after=-60, before=0, n_clusters=3)
model.get_data()
assert model.df.shape == expected_shape