In [1]:
# default_exp core

# core

> API details.

In [2]:
# hide
from nbdev.showdoc import *

In [3]:
# export

import numpy as np
import pandas as pd
from tslearn.clustering import TimeSeriesKMeans
from netdata_pandas.data import get_data

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [4]:
# export

class Clusterer:
    """
    """
    
    def __init__(self, 
                 hosts: list, charts: list, after: int, before: int, diff: bool = False, norm: bool = True, 
                 smooth_n: int = 5, smooth_func: str = 'mean', n_clusters: int = 10, min_n: int = 4):
        self.hosts = hosts
        self.charts = charts
        self.after = after
        self.before = before
        self.diff = diff
        self.norm = norm
        self.smooth_n = smooth_n
        self.smooth_func = smooth_func
        self.n_clusters = n_clusters
        self.min_n = min_n
        self.cluster_quality_dict = {}
        
    def get_data(self):
        """
        """
        self.df = get_data(self.hosts, self.charts, after=self.after, before=self.before, user=None, pwd=None)
        
    def preprocess_data(self):
        """
        """
        if self.diff:
            self.df = self.df.diff()
        if self.smooth_n > 0:
            if self.smooth_func == 'mean':
                self.df = self.df.rolling(self.smooth_n).mean().dropna(how='all')
            elif self.smooth_func == 'max':
                self.df = self.df.rolling(self.smooth_n).max().dropna(how='all')
            elif self.smooth_func == 'min':
                self.df = self.df.rolling(self.smooth_n).min().dropna(how='all')
            elif self.smooth_func == 'sum':
                self.df = self.df.rolling(self.smooth_n).sum().dropna(how='all')
            elif self.smooth_func == 'median':
                self.df = self.df.rolling(self.smooth_n).median().dropna(how='all')
            else:
                self.df = self.df.rolling(self.smooth_n).mean().dropna(how='all')
        if self.norm:
            self.df = (self.df-self.df.min())/(self.df.max()-self.df.min())
        self.df = self.df.dropna(axis=1, how='all')
        self.df = self.df.set_index(pd.to_datetime(self.df.index, unit='s'))
    
    def cluster_data(self):
        """
        """
        self.model = TimeSeriesKMeans(
            n_clusters=self.n_clusters, metric="euclidean", max_iter=10, n_init=2
        ).fit(self.df.transpose().values)
        self.df_cluster = pd.DataFrame(list(zip(self.df.columns, self.model.labels_)), columns=['metric', 'cluster'])
        self.cluster_metrics_dict = self.df_cluster.groupby(['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
        self.cluster_len_dict = self.df_cluster['cluster'].value_counts().to_dict()
        self.clusters_all = [cluster for cluster in self.cluster_len_dict]
        self.clusters_dropped = [cluster for cluster in self.cluster_len_dict if self.cluster_len_dict[cluster]<self.min_n]
        self.clusters_final = [cluster for cluster in self.cluster_len_dict if self.cluster_len_dict[cluster]>=self.min_n]
        
    def generate_quality_scores(self):
        """
        """
        for cluster_number in self.clusters_all:
            self.x_corr = self.df[self.cluster_metrics_dict[cluster_number]].corr().abs().values
            self.x_corr_mean = round(self.x_corr[np.triu_indices(self.x_corr.shape[0],1)].mean(),2)
            self.cluster_quality_dict[cluster_number] = self.x_corr_mean
            
    def generate_df_cluster_meta(self):
        """
        """
        self.df_cluster_meta = pd.DataFrame.from_dict(self.cluster_len_dict, orient='index', columns=['n'])
        self.df_cluster_meta.index.names = ['cluster']
        self.df_cluster_meta['quality_score'] = self.df_cluster_meta.index.map(self.cluster_quality_dict)
        self.df_cluster_meta = self.df_cluster_meta.sort_values('quality_score', ascending=False)
        
    def generate_df_cluster_centers(self):
        """
        """
        self.df_cluster_centers = pd.DataFrame(self.model.cluster_centers_.reshape(self.model.cluster_centers_.shape[0],self.model.cluster_centers_.shape[1])).transpose()
        self.df_cluster_centers.index = self.df.index



In [5]:
# hide
# tests

model = Clusterer(
    hosts=['london.my-netdata.io'],
    charts=['system.cpu'],
    after=-900,
    before=0,
    n_clusters=3
)
model.get_data()
#assert model.df.shape == (60,9)

model.preprocess_data()
model.df.head()

Unnamed: 0_level_0,system.cpu|iowait,system.cpu|softirq,system.cpu|steal,system.cpu|system,system.cpu|user
time_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-12-13 15:16:16,0.0,0.196365,0.58675,0.308037,0.179805
2020-12-13 15:16:17,0.0,0.196365,0.765475,0.445381,0.220774
2020-12-13 15:16:18,0.0,0.0,0.805596,0.479205,0.219811
2020-12-13 15:16:19,0.0,0.0,0.829818,0.371518,0.307346
2020-12-13 15:16:20,0.0,0.0,0.814642,0.408105,0.264008


In [6]:
model.cluster_data()
model.model

TimeSeriesKMeans(max_iter=10, n_init=2)

In [7]:
model.df_cluster.head()

Unnamed: 0,metric,cluster
0,system.cpu|iowait,0
1,system.cpu|softirq,2
2,system.cpu|steal,1
3,system.cpu|system,1
4,system.cpu|user,1


In [8]:
model.generate_quality_scores()
model.cluster_quality_dict

  self.x_corr_mean = round(self.x_corr[np.triu_indices(self.x_corr.shape[0],1)].mean(),2)
  ret = ret.dtype.type(ret / rcount)


{1: 0.2, 2: nan, 0: nan}

In [9]:
model.generate_df_cluster_meta()
model.df_cluster_meta.head()

Unnamed: 0_level_0,n,quality_score
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3,0.2
2,1,
0,1,


In [10]:
model.generate_df_cluster_centers()
model.df_cluster_centers.head()

Unnamed: 0_level_0,0,1,2
time_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-12-13 15:16:16,0.0,0.358197,0.196365
2020-12-13 15:16:17,0.0,0.47721,0.196365
2020-12-13 15:16:18,0.0,0.501538,0.0
2020-12-13 15:16:19,0.0,0.502894,0.0
2020-12-13 15:16:20,0.0,0.495585,0.0
