### Experiment

##### Procedure:
1. Build metric names for each service name
2. Request metrics (with metric names) from the time-series database
3. Merge the metrics of each service (outer join: keeps every value, but introduces missing values (NaN)) --> `[Data Structure]` one data frame per service, e.g. {'service_name_1': df1, 'service_name_2': df2, ...}
4. Get rid of missing values (Linear Interpolation)
5. Apply an unsupervised learning algorithm (Isolation Forest) to every data frame and store the result --> `[Data Structure]` one data frame of anomaly points per service, e.g. {'service_name_1': adf1, 'service_name_2': adf2, ...}
6. Merge anomaly points data frames (Simple concatenation) --> `[Data Structure]` data frame
7. Sort data frame by time (timestamp)
8. Add new column `time_diff` to the data frame with the difference between consecutive timestamps
9. Add new column `time_diff_check` to the data frame with values: 1 if `time_diff` <= the given `time_diff_threshold`, 0 otherwise
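10. Remove the rows whose `time_diff_check` is 0, except those immediately followed by a row whose `time_diff_check` is 1 (so both members of a close pair are kept) --> `[DataFrame]` time; service_name; etc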

Missing: an automatic check of whether the services are neighbours within each time frame (currently done manually using Zipkin).

In [1]:
import os
import sys
from os import path

# Location of the local Graphy checkout that provides the `graphy` package.
# Set the GRAPHY_PATH environment variable to run the notebook on another machine;
# the original hard-coded location is kept as the default.
graphy_path = path.abspath(
    os.environ.get('GRAPHY_PATH', '/mnt/B6C8BAECC8BAAA4F/André/UC/Tese/MScThesis/Graphy')
)

# Re-running this cell must not keep appending the same entry to sys.path.
if graphy_path not in sys.path:
    sys.path.append(graphy_path)

import graphy

In [2]:
from graphy.utils import zipkin

# Names of all services to analyse; the metric names built further down are derived from them.
# NOTE(review): presumably fetched from a running Zipkin instance - confirm in graphy.utils.zipkin.
service_names = zipkin.get_services()

# Display the list of service names.
service_names

['api_com',
 'apig_console',
 'batch_console',
 'cinder-api-cascaded',
 'cinder-scheduler-cascaded',
 'cinder-volume-cascaded',
 'cluster-manager',
 'evs',
 'hss',
 'mysql',
 'neutron-server-cascaded',
 'neutron-server-cascading',
 'nova-api-cascaded',
 'nova-api-cascading',
 'nova-compute-cascaded',
 'nova-conductor-cascaded',
 'nova-scheduler-cascaded',
 'vpc-service']

In [3]:
from graphy.db import opentsdb

# Name templates of the three metrics tracked per service, in the order
# (calls in, calls out, average response time).
metric_name_templates = (
    'huawei.call_count_in.{}',
    'huawei.call_count_out.{}',
    'huawei.response_time_avg.{}',
)

# One tuple of three fully-qualified metric names per service.
metric_names = [
    tuple(template.format(service_name) for template in metric_name_templates)
    for service_name in service_names
]

metric_names

[('huawei.call_count_in.api_com',
  'huawei.call_count_out.api_com',
  'huawei.response_time_avg.api_com'),
 ('huawei.call_count_in.apig_console',
  'huawei.call_count_out.apig_console',
  'huawei.response_time_avg.apig_console'),
 ('huawei.call_count_in.batch_console',
  'huawei.call_count_out.batch_console',
  'huawei.response_time_avg.batch_console'),
 ('huawei.call_count_in.cinder-api-cascaded',
  'huawei.call_count_out.cinder-api-cascaded',
  'huawei.response_time_avg.cinder-api-cascaded'),
 ('huawei.call_count_in.cinder-scheduler-cascaded',
  'huawei.call_count_out.cinder-scheduler-cascaded',
  'huawei.response_time_avg.cinder-scheduler-cascaded'),
 ('huawei.call_count_in.cinder-volume-cascaded',
  'huawei.call_count_out.cinder-volume-cascaded',
  'huawei.response_time_avg.cinder-volume-cascaded'),
 ('huawei.call_count_in.cluster-manager',
  'huawei.call_count_out.cluster-manager',
  'huawei.response_time_avg.cluster-manager'),
 ('huawei.call_count_in.evs',
  'huawei.call_count_o

In [4]:
import pandas as pd

# Time window of the experiment (epoch timestamps passed to the time-series database).
start_timestamp = 1530057600
end_timestamp = 1530316800

# Column label for each position of a metric-name tuple built above.
metric_columns = ('call_count_in', 'call_count_out', 'response_time_avg')

# service name -> data frame with the columns 'time' + metric_columns
dataframes = dict()

for metric_name in metric_names:
    # Metric names look like 'huawei.<metric>.<service>'; split on the first two dots only,
    # so a service name that itself contains a dot is kept whole.
    service_name = metric_name[0].split('.', 2)[-1]

    fetched = [
        (column, opentsdb.get_metrics(name, start_timestamp, end_timestamp))
        for column, name in zip(metric_columns, metric_name)
    ]

    # A service is only usable when all three metrics are available. Test for the missing
    # ones explicitly instead of catching every exception, so that unrelated failures
    # (e.g. in the merge) are not misreported as missing data.
    metrics_lack = [column for column, metrics in fetched if metrics is None]
    if metrics_lack:
        print('{} lacks information in {}'.format(service_name, metrics_lack))
        continue

    # Outer-join the three series on their timestamp: every sample is kept, and a metric
    # with no sample at a given timestamp becomes NaN.
    df = None
    for column, metrics in fetched:
        metric_df = pd.DataFrame(list(metrics.items()), columns=['time', column])
        df = metric_df if df is None else pd.merge(df, metric_df, how='outer', on='time')

    dataframes[service_name] = df

dataframes

apig_console lacks information in ['call_count_in']
batch_console lacks information in ['call_count_in']
cluster-manager lacks information in ['call_count_in']
evs lacks information in ['call_count_out']
hss lacks information in ['call_count_in']
mysql lacks information in ['call_count_out']
neutron-server-cascaded lacks information in ['call_count_out']
nova-scheduler-cascaded lacks information in ['call_count_out']


{'api_com':            time  call_count_in  call_count_out  response_time_avg
 0    1530142200            1.0             541      184523.203125
 1    1530143100            1.0             185      200106.343750
 2    1530143475            1.0              70      210183.750000
 3    1530143490            1.0              70      210183.750000
 4    1530143700            1.0             185      200106.343750
 5    1530178200            6.0             306      313676.406250
 6    1530178275            4.0              42      211469.921875
 7    1530178290            6.0             151      287357.187500
 8    1530178305            2.0             109      303640.093750
 9    1530178500            6.0             238      271398.093750
 10   1530179100            6.0             238      271398.093750
 11   1530185400            9.0            1034      191995.031250
 12   1530185700            3.0             145      212171.453125
 13   1530185730            3.0              95    

In [5]:
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt

# Default share of samples the Isolation Forest is told to treat as outliers.
outliers_fraction = 0.01

def isolation_forest(data_frame, contamination=None):
    """Flag the anomalous rows of one service's metrics with an Isolation Forest.

    The three metric columns are standardised, an ``IsolationForest`` is fitted on
    them, and its per-row prediction is stored in a new ``anomaly`` column of
    ``data_frame``. Note that the frame passed in is modified in place.

    Args:
        data_frame: frame with the columns 'time', 'call_count_in', 'call_count_out'
            and 'response_time_avg'. The notebook interpolates missing values before
            calling this function.
        contamination: value forwarded to ``IsolationForest(contamination=...)``.
            Defaults to the module-level ``outliers_fraction``.

    Returns:
        The rows predicted as anomalies (prediction == -1), restricted to the
        'time' column and the three metric columns.
    """
    feature_columns = ['call_count_in', 'call_count_out', 'response_time_avg']
    if contamination is None:
        contamination = outliers_fraction

    # Standardise the metrics before fitting the model.
    scaled_features = pd.DataFrame(
        StandardScaler().fit_transform(data_frame[feature_columns])
    )

    # A fresh, fixed-seed generator per call keeps the result reproducible.
    model = IsolationForest(
        contamination=contamination,
        n_jobs=-1,
        random_state=np.random.RandomState(0),
    )
    model.fit(scaled_features)

    # Assign the raw prediction array (positional). Wrapping it in a pd.Series would
    # align on index labels and silently produce NaN / misplaced flags whenever
    # data_frame does not carry the default 0..n-1 index.
    data_frame['anomaly'] = model.predict(scaled_features)

    # Return (not print) the rows the model marked as anomalous.
    return data_frame.loc[data_frame['anomaly'] == -1, ['time'] + feature_columns]

In [6]:
def fill_missing_values(data_frame, method='linear', na_values=0):
    """Return a copy of ``data_frame`` with its missing values filled in.

    Gaps are interpolated with ``method`` first; whatever is still missing after
    the interpolation is replaced by ``na_values``. The input frame is not modified.
    """
    interpolated = data_frame.copy().interpolate(method=method)
    return interpolated.fillna(na_values)

def add_new_column(data_frame, colum_name: str, value):
    """Return a copy of ``data_frame`` whose column ``colum_name`` is set to ``value``.

    The input frame is left untouched; the column is added to (or overwritten in)
    the copy only.
    """
    extended = data_frame.copy()
    extended[colum_name] = value
    return extended

def merge_data_frames(data_frames: list):
    """Stack several data frames on top of each other with ``pd.concat``.

    Args:
        data_frames: the frames to concatenate, in order.

    Returns:
        A single frame holding the rows of all inputs. Each row keeps the index
        label it had in its source frame, so labels may repeat.
    """
    return pd.concat(data_frames)

# Detect the anomalies of every service and collect them, tagged with the service name.
data_frames_list = []

for service_name in dataframes:
    # The stored frame is replaced by its gap-free version before the model runs on it.
    dataframes[service_name] = fill_missing_values(dataframes[service_name])
    anomaly_values = add_new_column(
        isolation_forest(dataframes[service_name]), 'service_name', service_name
    )
    data_frames_list.append(anomaly_values.copy())

data_frames = merge_data_frames(data_frames_list)

# Put the anomalies of all services in chronological order.
data_frames['time'] = pd.to_numeric(data_frames['time'])
data_frames = data_frames.sort_values(by='time')

# Gap between each anomaly and the one right before it (NaN for the first row).
# The timestamps are converted with unit='s' below, so the gap is in seconds.
data_frames['time_diff'] = data_frames['time'].diff()

# Maximum gap for two consecutive anomalies to count as close in time.
# Other windows that were tried: 1 * 60 * 60 (one hour) and 30 * 60 (half an hour).
time_diff_threshold = 60 # minute in seconds
is_close_to_previous = data_frames['time_diff'].le(time_diff_threshold)
data_frames['time_diff_check'] = np.where(is_close_to_previous, 1, 0)

# True when the *following* anomaly is close to this one, so that the first member
# of a close pair survives the filter as well.
data_frames['time_diff_check_next'] = data_frames['time_diff_check'].shift(-1).eq(1)

data_frames['time'] = pd.to_datetime(data_frames['time'], unit='s')

# Keep only the anomalies that have a close neighbour before or after them.
keep_row = data_frames['time_diff_check'].eq(1) | data_frames['time_diff_check_next']
data_frames = data_frames[keep_row]

print('Anomalous services:\n',data_frames[['time', 'service_name']])



Anomalous services:
                    time               service_name
7   2018-06-28 14:30:00      nova-compute-cascaded
347 2018-06-28 14:30:00         nova-api-cascading
234 2018-06-28 14:30:00   neutron-server-cascading
274 2018-06-28 14:39:15                    api_com
275 2018-06-28 14:39:30                    api_com
329 2018-06-28 20:30:00   neutron-server-cascading
453 2018-06-28 20:30:00         nova-api-cascading
239 2018-06-29 01:30:00        cinder-api-cascaded
195 2018-06-29 01:30:00  cinder-scheduler-cascaded
97  2018-06-29 03:15:00                    api_com
206 2018-06-29 03:15:00  cinder-scheduler-cascaded
63  2018-06-29 03:30:00          nova-api-cascaded
182 2018-06-29 03:30:00     cinder-volume-cascaded
208 2018-06-29 03:30:00  cinder-scheduler-cascaded
252 2018-06-29 03:30:00        cinder-api-cascaded
69  2018-06-29 03:30:00    nova-conductor-cascaded
99  2018-06-29 03:30:00                    api_com
180 2018-06-29 03:55:00                vpc-service
185 2018-0

