In [11]:
import os
import re
import json
import numpy as np
import pandas as pd
import shappack
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
import warnings
warnings.filterwarnings('ignore')

np.random.seed(123)
PLOTS_NUM = 120
TARGET_METRICS = ["cpu_usage_seconds_total", 
                                    "memory_working_set_bytes",
                                    "network_transmit_bytes_total",
                                    "network_receive_bytes_total",
                                    "fs_writes_total",
                                    "fs_reads_total"]
params = {
    "n_components": 0.8
}

In [12]:
data_files = os.listdir(path='./data')
data_dict = {}
for i, file_name in enumerate(data_files):
    f = re.sub(".*argowf-chaos-.*?-", "", file_name)
    data_dict[i] = {}
    data_dict[i]["component"] = f.split("_")[0]
    data_dict[i]["metric"] = f.split("_")[1].split("-")[1]    
    data_dict[i]["file_name"] = file_name
    data_dict[i]["file_path"] = f'./data/{file_name}'

In [13]:
class ShapPCA(object):
    def __init__(self, train_data, model=PCA(n_components=0.80)):
        self.model = model.fit(train_data)

    def predict(self, data):
        input_data = np.asarray(data)       
        output_data = self._reconstruct_data(input_data)        
        errors = np.mean((input_data - output_data)**2, axis=1)
        return np.asarray(errors)
    
    def reconstruction_error(self, data):
        input_data = np.asarray(data)       
        output_data = self._reconstruct_data(input_data)   
        recon_error = (input_data - output_data)**2
        return recon_error

    def _reconstruct_data(self, data):
        transformed_data = self.model.transform(data)  
        reconstructed_data = self.model.inverse_transform(transformed_data)
        return reconstructed_data

def read_file(file_path):
    with open(file_path) as f:
            raw_data = json.load(f)
    containers_data = raw_data["containers"]
    data_df = pd.DataFrame()
    for con in containers_data:
        if con in ["queue-master", "rabbitmq", "session-db"]:
            continue
        for metric in containers_data[con]:
            container_name = metric["container_name"]
            metric_name = metric["metric_name"].replace("container_", "")
            if metric_name not in TARGET_METRICS:
                continue
            column_name = "{}_{}".format(container_name, metric_name)
            data_df[column_name] = np.array(metric["values"], dtype=np.float)[:, 1][-PLOTS_NUM:]
    data_df = data_df.round(4).fillna(data_df.mean())
    return data_df
    
def preprocessing(data_df):
    scaler = StandardScaler()
    data_std = scaler.fit_transform(data_df)
    return data_std

def find_cause_index(shap_values):
    if len(shap_values.shape) == 1:
        cause_index = np.argsort(shap_values)[::-1]
    elif len(shap_values.shape) == 2:
        mean_shap = np.mean(np.abs(shap_values), axis=0)
        cause_index = np.argsort(mean_shap)[::-1]
    else:
        raise("The size of `shap_values` argument does not match")
    return cause_index

def find_ranking(cause_metrics, cause_index, col_list):
    if len(cause_metrics) != 1:
        raise("There are multiple causal metrics.")
    else:
        idx = col_list.index(cause_metrics[0])
        rank = list(cause_index).index(idx)
    rank += 1
    return rank

def top_k_accuracy(rank, k=1):
    if rank <= k:
        return True
    else:
        return False

In [14]:
%%time
cnt = 0
top_1 = []
top_3 = []
recon_top_1 = []
recon_top_3 = []
gbt_top_1 = []
gbt_top_3 = []
results_dict = {}

for i in data_dict:
    file_path = data_dict[i]["file_path"]
    data_df = read_file(file_path)
    col_list = list(data_df.columns)
    cause_metrics = [col for col in col_list if (data_dict[i]["component"] + "_" in col) and (data_dict[i]["metric"] in col)]
    print(file_path)
    # Preprocessing
    cnt += 1
    data_df = preprocessing(data_df)
    train_data, test_data = data_df[:116], data_df[116:]
    # SHAP based diagnosis
    model = ShapPCA(train_data, model=PCA(n_components=params["n_components"]))
    explainer = shappack.KernelExplainer(model.predict, train_data)
    idx = 4
    shap_value = explainer.shap_values(test_data[-idx:], n_workers=-1)
    cause_index = find_cause_index(shap_value)
    rank = find_ranking(cause_metrics, cause_index, col_list)
    top_1.append(top_k_accuracy(rank, k=1))
    top_3.append(top_k_accuracy(rank, k=3))
    # Reconstruction error based diagnosis 
    recon_error = np.mean(model.reconstruction_error(test_data[-idx:]), axis=0)
    cause_index_recon = np.argsort(recon_error)[::-1]
    rank_recon = find_ranking(cause_metrics, cause_index_recon, col_list)
    recon_top_1.append(top_k_accuracy(rank_recon, k=1))
    recon_top_3.append(top_k_accuracy(rank_recon, k=3))
    
    # GBT based diagnosis
    gbt = np.abs(np.mean(test_data[10:], axis=0) - np.mean(train_data, axis=0))
    cause_index_gbt = np.argsort(gbt)[::-1]
    rank_gbt = find_ranking(cause_metrics, cause_index_gbt, col_list)
    gbt_top_1.append(top_k_accuracy(rank_gbt, k=1))
    gbt_top_3.append(top_k_accuracy(rank_gbt, k=3))
    
    result = {}
    result["file_name"] = data_dict[i]["file_name"]
    result["component"] = data_dict[i]["component"]
    result["metric"] = data_dict[i]["metric"]
    result["rank"] = rank
    result["rank_recon"] = rank_recon
    results_dict[i] = result
print(f"Top 1: {round(sum(top_1)/len(top_1), 3)}, Recon Top1: {round(sum(recon_top_1)/len(recon_top_1), 3)}, GBT Top 1: {round(sum(gbt_top_1)/len(gbt_top_1), 3)}")
print(f"Top 3: {round(sum(top_3)/len(top_3), 3)}, Recon Top3: {round(sum(recon_top_3)/len(recon_top_3), 3)}, GBT Top 3: {round(sum(gbt_top_3)/len(gbt_top_3), 3)}")

./data/2021-08-18-argowf-chaos-b2qdj-user_pod-memory-hog_0.json
./data/2021-08-01-argowf-chaos-g85hs-carts-db_pod-cpu-hog_0.json
./data/2021-08-01-argowf-chaos-g85hs-catalogue-db_pod-cpu-hog_0.json
./data/2021-08-18-argowf-chaos-b2qdj-front-end_pod-cpu-hog_4.json
./data/2021-08-18-argowf-chaos-b2qdj-catalogue_pod-cpu-hog_4.json
./data/2021-08-01-argowf-chaos-g85hs-orders-db_pod-memory-hog_0.json
./data/2021-08-18-argowf-chaos-b2qdj-carts-db_pod-cpu-hog_2.json
./data/2021-08-18-argowf-chaos-b2qdj-front-end_pod-cpu-hog_3.json
./data/2021-08-18-argowf-chaos-b2qdj-carts-db_pod-cpu-hog_1.json
./data/2021-08-01-argowf-chaos-g85hs-orders_pod-cpu-hog_0.json
./data/2021-08-18-argowf-chaos-b2qdj-orders_pod-memory-hog_0.json
./data/2021-08-01-argowf-chaos-g85hs-user_pod-memory-hog_0.json
./data/2021-08-18-argowf-chaos-b2qdj-user_pod-memory-hog_3.json
./data/2021-08-18-argowf-chaos-b2qdj-carts-db_pod-memory-hog_0.json
./data/2021-08-18-argowf-chaos-b2qdj-orders_pod-cpu-hog_1.json
./data/2021-08-18