<a href="https://colab.research.google.com/github/andrewm4894/colabs/blob/master/netdata_agent_anomaly_detection_python_minimal_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install dlib
#!pip install dlib

## Imports

In [2]:
## IMPORTS ##

import pprint as pp
import numpy as np
import requests
import time
#import dlib
from sklearn.cluster import KMeans, Birch
from scipy.spatial.distance import cdist

## Inputs

In [3]:
## INPUTS ##

# number of iterations of the scoring loop; each iteration polls the host for fresh metric values
n_steps = 100

# hostname of the Netdata agent whose /api/v1/allmetrics endpoint is queried (over https)
host = 'london.my-netdata.io'

## Model Config

In [4]:
## MODEL CONFIG ##

# one entry per model; every model listed here gets its own anomaly score
config = {
    # the key is a human-friendly model name
    "just_cpu_user_and_system": {
        # metrics feeding the model, written as {chart: [dimensions]}
        "metrics": {
            "system.cpu": ["user", "system"],
        },
        # preprocessing / training parameters for this model
        "params": dict(
            diffs=True,            # difference the smoothed series before building features
            n_smoothing=3,         # rolling-average window length
            train_data_size=25,    # number of recent observations used for each training run
            train_every_n=50,      # retrain whenever the step number is a multiple of this
            n_lags=3,              # number of lagged values kept alongside the latest one
            n_clusters=2,          # number of KMeans clusters
            anomaly_score_cap=5,   # upper bound applied to the scaled anomaly score
        ),
    },
}

pp.pprint(config)

{'just_cpu_user_and_system': {'metrics': {'system.cpu': ['user', 'system']},
                              'params': {'anomaly_score_cap': 5,
                                         'diffs': True,
                                         'n_clusters': 2,
                                         'n_lags': 3,
                                         'n_smoothing': 3,
                                         'train_data_size': 25,
                                         'train_every_n': 50}}}


## Some Setup

In [5]:
# map every model to the "chart|dim" data keys it uses; the keys are used later to
# combine per-metric preprocessed data into a single feature vector for the model
model_data_key_map = {}
for model, model_config in config.items():
    for chart, dims in model_config['metrics'].items():
        for dim in dims:
            data_key = f'{chart}|{dim}'
            # create the model's list on first use, then add the key to it
            model_data_key_map.setdefault(model, []).append(data_key)

pp.pprint(model_data_key_map)

{'just_cpu_user_and_system': ['system.cpu|user', 'system.cpu|system']}


In [6]:
# stores of raw and preprocessed values per data key, filled in by the scoring loop
data_history_raw = dict()
data_history_processed = dict()

# per-model state, one slot for every model named in config
models = dict.fromkeys(config)                        # fitted estimator, None until trained
model_train_data = {name: list() for name in config}  # feature vectors gathered for training
model_predict_data = {name: list() for name in config}
models_meta = {name: dict() for name in config}       # scaling metadata recorded at training time

## Helper Functions

In [7]:
## HELPER FUNCTIONS ##


def preprocess_data(x, n_smoothing, n_lags, diffs, print_steps=False):
    """Turn a raw series into its most recent lagged feature vector.

    The series goes through three stages:
    1. rolling mean over windows of n_smoothing points (rounded to 2 decimals).
    2. first differences of the smoothed values (rounded to 2 decimals), only if diffs is truthy.
    3. keep the last n_lags + 1 values.

    Args:
        x: sequence of numeric observations, oldest first.
        n_smoothing: rolling-average window length.
        n_lags: number of lagged values to keep in addition to the latest one.
        diffs: whether to difference the smoothed series.
        print_steps: when True, print the intermediate result of every stage.

    Returns:
        A list with at most n_lags + 1 values; shorter (possibly empty) when x
        does not yet hold enough observations.
    """

    def _show(label, values):
        # emit a trace line only when the caller asked for it
        if print_steps:
            print(label, values)

    if print_steps:
        print(f'n_smoothing={n_smoothing}, diffs={diffs}, n_lags={n_lags}')
    _show('x:           ', x)

    # smoothing: one averaged value per full window that fits inside x
    n_windows = len(x) - n_smoothing + 1
    smoothed = [
        round(sum(x[start:start + n_smoothing]) / n_smoothing, 2)
        for start in range(n_windows)
    ]
    _show('x smoothed:  ', smoothed)

    # differences between consecutive smoothed values
    if diffs:
        result = [round(later - earlier, 2) for earlier, later in zip(smoothed, smoothed[1:])]
        _show('x diff:      ', result)
    else:
        result = smoothed

    # lagged values: the latest value plus the n_lags values before it
    result = result[-(n_lags + 1):]
    _show('x lagged:    ', result)
    _show('x processed: ', result)

    return result


def get_params_from_config(config, model):
    """Look up one model's parameters in config and return them as a tuple.

    Args:
        config: mapping of model name -> {'params': {...}, ...}.
        model: name of the model whose parameters are wanted.

    Returns:
        Tuple of (diffs, n_smoothing, train_data_size, train_every_n,
        n_lags, n_clusters, anomaly_score_cap), in that order.

    Raises:
        KeyError: if the model or any of the parameters is missing.
    """
    params = config[model]['params']

    # order matters: callers unpack the tuple positionally
    ordered_names = (
        'diffs',
        'n_smoothing',
        'train_data_size',
        'train_every_n',
        'n_lags',
        'n_clusters',
        'anomaly_score_cap',
    )

    return tuple(params[name] for name in ordered_names)



## Generate Anomaly Scores

In [8]:
## ALGO ##

# Polls the host n_steps times. On every step the latest value of each configured
# chart/dimension is preprocessed into a lagged feature vector, the model is (re)trained
# when due, and once a model exists the newest feature vector is scored against it.
#
# NOTE(review): there is no delay between steps, so consecutive polls can return the same
# sample from the agent (the raw history displayed further down contains repeated values).
# NOTE(review): data_history_raw / data_history_processed are keyed by "chart|dim" only, so
# two models sharing a metric would both append to the same history on every step.

# run each step
for n in range(n_steps):

    # get latest data from /allmetrics once per step so every model scores the same snapshot;
    # the timeout stops a stalled connection from hanging the loop and raise_for_status
    # surfaces HTTP errors instead of failing later while parsing the body
    r = requests.get(f'https://{host}/api/v1/allmetrics?format=json', timeout=10)
    r.raise_for_status()
    data_now_raw = r.json()

    # for each model defined in config
    for model in config:

        # get params for the model
        diffs, n_smoothing, train_data_size, train_every_n, n_lags, n_clusters, anomaly_score_cap = get_params_from_config(config, model)

        # for each chart from the host that we are using in the model
        for chart in config[model]['metrics']:

            # for each dim we want from that chart
            for dim in config[model]['metrics'][chart]:

                # create a key value for current data we want to use
                data_key = f'{chart}|{dim}'
                data_value = data_now_raw[chart]['dimensions'][dim]['value']

                # append data to relevant part of raw history data map
                data_history_raw.setdefault(data_key, []).append(data_value)

                # limit raw history to recent data needed for preprocessing
                # (n_smoothing + n_lags + 1 points give a full feature vector when diffs are on;
                # a couple of extra points are kept)
                data_history_raw[data_key] = data_history_raw[data_key][-(n_smoothing + n_lags + 3):]

                # process current data and store it for use by training and prediction
                data_now_processed = preprocess_data(data_history_raw[data_key], n_smoothing, n_lags, diffs)
                data_history_processed.setdefault(data_key, []).append(data_now_processed)

                # print example of processing for last observation
                if n == (n_steps-1):
                    print(f'\nprinting example preprocess steps for {data_key} at step {n}\n')
                    preprocess_data(data_history_raw[data_key], n_smoothing, n_lags, diffs, print_steps=True)

        # train model if needed
        if n >= train_data_size and n % train_every_n == 0:

            # index of the oldest processed observation inside the training window
            train_data_offset = len(data_history_processed[model_data_key_map[model][0]]) - train_data_size

            # loop over each training observation we expect to have
            for i in range(train_data_size):

                # feature vector = processed vectors of every metric used by the model, concatenated
                x = []
                for data_key in model_data_key_map[model]:
                    x.extend(data_history_processed[data_key][(train_data_offset+i)])

                # append the training observation to the model data
                model_train_data[model].append(x)

            # just keep most recent train_data_size values
            model_train_data[model] = model_train_data[model][-train_data_size:]

            # make sure we only use training data with the expected shape
            # (observations from the first few steps are shorter than a full feature vector)
            model_n_dims = len(model_data_key_map[model])
            model_feature_vector_len = model_n_dims * (n_lags + 1)
            train_data = [x for x in model_train_data[model] if len(x) == model_feature_vector_len]

            if len(train_data) < n_clusters:
                # KMeans cannot be fitted with fewer samples than clusters; keep any earlier model
                print(f'...skipping training of model {model} at step {n}: only {len(train_data)} usable observations')
            else:
                print(f'...training model {model} at step {n} on {len(train_data)} processed observations each a list of {len(train_data[-1])} numbers')

                # fit a kmeans with n_clusters
                models[model] = KMeans(n_clusters=n_clusters)
                models[model].fit(train_data)

                # mean distance of every training observation to the cluster centers
                train_dists = cdist(train_data, models[model].cluster_centers_)
                mean_train_dists = np.mean(train_dists, axis=1)

                # store min and max observed in the train data, used to min/max scale scores at prediction time
                models_meta[model]['max_dist'] = np.max(mean_train_dists)
                models_meta[model]['min_dist'] = np.min(mean_train_dists)

        # gather recent data for model prediction once model has been trained
        if models[model] is not None:
            x = []
            for data_key in model_data_key_map[model]:
                x.extend(data_history_processed[data_key][-1])

            pred_data = [x]

            # get anomaly score for latest processed data
            anomaly_score = round(np.mean(cdist(pred_data, models[model].cluster_centers_)), 2)

            # min/max normalize based on training data; observations outside the range seen
            # in training fall outside [0, 1]
            min_dist = models_meta[model]['min_dist']
            dist_range = models_meta[model]['max_dist'] - min_dist
            if dist_range > 0:
                anomaly_score_scaled = (anomaly_score - min_dist) / dist_range
            else:
                # degenerate training data (all observations equally far from the centers):
                # avoid dividing by zero and treat anything farther than training as maximally anomalous
                anomaly_score_scaled = 0.0 if anomaly_score <= min_dist else anomaly_score_cap

            # cap anomaly score
            anomaly_score_scaled = min(anomaly_score_scaled, anomaly_score_cap)

            print(f'...anomaly score at step {n} for model {model} = {anomaly_score_scaled}')

...training model just_cpu_user_and_system at step 50 on 25 processed observations each a list of 8 numbers
...anomaly score at step 50 for model just_cpu_user_and_system = 0.1647794787210567
...anomaly score at step 51 for model just_cpu_user_and_system = 0.312464692001299
...anomaly score at step 52 for model just_cpu_user_and_system = 0.4232286019614808
...anomaly score at step 53 for model just_cpu_user_and_system = 0.4232286019614808
...anomaly score at step 54 for model just_cpu_user_and_system = 0.5155318602616321
...anomaly score at step 55 for model just_cpu_user_and_system = 0.5155318602616321
...anomaly score at step 56 for model just_cpu_user_and_system = 0.5155318602616321
...anomaly score at step 57 for model just_cpu_user_and_system = 0.5155318602616321
...anomaly score at step 58 for model just_cpu_user_and_system = 0.4047679503014505
...anomaly score at step 59 for model just_cpu_user_and_system = 0.47861055694157156
...anomaly score at step 60 for model just_cpu_user_

## Look at some objects we have used

In [9]:
# recent raw values kept per "chart|dim" key; the scoring loop trims each list
# to the window of recent values needed for preprocessing
print(model)
data_history_raw

just_cpu_user_and_system


{'system.cpu|system': [1.0025063,
  1.0025063,
  0.7481297,
  0.7481297,
  0.7481297,
  0.7481297,
  0.7481297,
  1.25,
  1.25],
 'system.cpu|user': [1.2531328,
  1.2531328,
  1.9950125,
  1.9950125,
  1.9950125,
  1.9950125,
  1.9950125,
  2.0,
  2.0]}

In [10]:
# a history of processed observations (one list per step) per "chart|dim" key;
# training uses a window of them and the most recent one is used when getting the anomaly score
print(model)
data_history_processed

just_cpu_user_and_system


{'system.cpu|system': [[],
  [],
  [],
  [0.17],
  [0.17, 0.16],
  [0.17, 0.16, 0.17],
  [0.17, 0.16, 0.17, 0.0],
  [0.16, 0.17, 0.0, 0.0],
  [0.17, 0.0, 0.0, -0.08],
  [0.0, 0.0, -0.08, -0.09],
  [0.0, -0.08, -0.09, -0.08],
  [-0.08, -0.09, -0.08, 0.0],
  [-0.09, -0.08, 0.0, 0.33],
  [-0.08, 0.0, 0.33, 0.33],
  [0.0, 0.33, 0.33, 0.34],
  [0.33, 0.33, 0.34, 0.0],
  [0.33, 0.34, 0.0, 0.0],
  [0.34, 0.0, 0.0, -0.33],
  [0.0, 0.0, -0.33, -0.33],
  [0.0, -0.33, -0.33, -0.33],
  [-0.33, -0.33, -0.33, 0.0],
  [-0.33, -0.33, 0.0, -0.01],
  [-0.33, 0.0, -0.01, 0.0],
  [0.0, -0.01, 0.0, 0.0],
  [-0.01, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.0],
  [0.0, 0.0, 0.0, 0.17],
  [0.0, 0.0, 0.17, 0.17],
  [0.0, 0.17, 0.17, 0.17],
  [0.17, 0.17, 0.17, 0.0],
  [0.17, 0.17, 0.0, -0.17],
  [0.17, 0.0, -0.17, -0.17],
  [0.0, -0.17, -0.17, -0.16],
  [-0.17, -0.17, -0.16, 0.0],
  [-0.17, -0.16, 0.0, -0.17],
  [-0.16, 0.0, -0.17, -0.17],
  [0.0, -0.17, -0.17, -0.17],
  [-0.17, -0.17, -0.17, 0.0],
  [-0.17, -0.17, 

In [11]:
# the actual data the model is trained on:
# each row is the concatenation of the processed vectors of every metric used in the model
# (train_data is only defined once the scoring loop has reached a training step)
print(model)
train_data

just_cpu_user_and_system


[[0.08, 0.0, 0.0, 0.01, 0.0, 0.0, 0.0, 0.17],
 [0.0, 0.0, 0.01, 0.0, 0.0, 0.0, 0.17, 0.17],
 [0.0, 0.01, 0.0, 0.01, 0.0, 0.17, 0.17, 0.17],
 [0.01, 0.0, 0.01, 0.0, 0.17, 0.17, 0.17, 0.0],
 [0.0, 0.01, 0.0, -0.01, 0.17, 0.17, 0.0, -0.17],
 [0.01, 0.0, -0.01, 0.0, 0.17, 0.0, -0.17, -0.17],
 [0.0, -0.01, 0.0, 0.0, 0.0, -0.17, -0.17, -0.16],
 [-0.01, 0.0, 0.0, 0.0, -0.17, -0.17, -0.16, 0.0],
 [0.0, 0.0, 0.0, 0.33, -0.17, -0.16, 0.0, -0.17],
 [0.0, 0.0, 0.33, 0.33, -0.16, 0.0, -0.17, -0.17],
 [0.0, 0.33, 0.33, 0.33, 0.0, -0.17, -0.17, -0.17],
 [0.33, 0.33, 0.33, 0.0, -0.17, -0.17, -0.17, 0.0],
 [0.33, 0.33, 0.0, 0.0, -0.17, -0.17, 0.0, 0.0],
 [0.33, 0.0, 0.0, -0.41, -0.17, 0.0, 0.0, 0.17],
 [0.0, 0.0, -0.41, -0.42, 0.0, 0.0, 0.17, 0.17],
 [0.0, -0.41, -0.42, -0.41, 0.0, 0.17, 0.17, 0.16],
 [-0.41, -0.42, -0.41, 0.0, 0.17, 0.17, 0.16, 0.0],
 [-0.42, -0.41, 0.0, 0.0, 0.17, 0.16, 0.0, 0.0],
 [-0.41, 0.0, 0.0, 0.0, 0.16, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01],
 [0.0, 0.0, 0.

In [12]:
# the feature vector used for the most recent anomaly score
# (pred_data is only defined once the scoring loop has a trained model to score against)
print(model)
pred_data

just_cpu_user_and_system


[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.17, 0.16]]