## Query data

In [1]:
import os
from prometheus_api_client import (
    PrometheusConnect,
    MetricSnapshotDataFrame,
    MetricRangeDataFrame
)
import sys
import datetime as dt
import pandas as pd
import pytz
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [2]:
prom = PrometheusConnect(url="http://192.168.50.113:9090", disable_ssl=True)

query_info_df = pd.read_csv("query_info.csv")

# read text file
with open("query.txt") as f:
    query_list = f.readlines()

# replace substring of each line in query_list
query_list=list(map(lambda x: x.replace('$tidb_cluster', 'tidb-cluster'), query_list))
query_list=list(map(lambda x: x.replace('$instance', 'basic-pd-0'), query_list))

In [3]:
# set query time
start_time = dt.datetime(2022, 5, 4, 13, 00, 00)+dt.timedelta(hours=8)
# start_time = end_time-dt.timedelta(hours=6)

In [4]:
for i in range(len(query_list)):
    query_df = pd.DataFrame()
    for t in range(7):
        mstart_time = start_time+dt.timedelta(hours=3)*t
        mend_time = mstart_time+dt.timedelta(hours=3)-dt.timedelta(seconds=1)
        metric_data = prom.custom_query_range(
            query_list[i],
            start_time=mstart_time,
            end_time=mend_time,
            step=1
        )

        metric_df = pd.DataFrame(columns=['timestamp'])
        for m in metric_data:
            if query_info_df.metric[i] == 'None':
                col_name = "None"
            else:
                if m['metric'] == {} and len(metric_data) > 1:
                    continue
                sub_metrics = query_info_df.metric[i].split("+")
                col_name = "_".join(
                    list(map(lambda x: m['metric'][x], sub_metrics)))
                col_name = col_name.replace('-', '_')
            temp_df = pd.DataFrame(m['values'], columns=[
                                   'timestamp', col_name])
            metric_df = pd.merge(metric_df, temp_df,
                                 on='timestamp', how='outer')

        query_df = pd.concat([query_df, metric_df])

    save_path = os.path.join(os.pardir, "data", "train", '_'.join(
        (query_info_df.name[i]).split())+'.csv')
    query_df.to_csv(save_path, index=False)


## Combine data

In [5]:
combined_df = pd.DataFrame(columns=['timestamp'])
# walk through directory
for root, dirs, files in os.walk(os.path.join(os.pardir, "data", "train")):
    for file in files:
        if file.endswith(".csv"):
            temp_df = pd.read_csv(os.path.join(root, file))
            if temp_df.empty:
                continue
            file_name = file.split('.')[0]
            # rename temp_df columns
            temp_df.columns = ['timestamp'] + list(
                map(lambda x: file_name+'__'+x, temp_df.columns[1:]))
            combined_df = pd.merge(combined_df, temp_df,
                                   on='timestamp', how='outer')
combined_df.set_index('timestamp')
# drop columns all nan
combined_df.dropna(axis=1, how='all', inplace=True)
combined_df.interpolate(inplace=True, limit=60, method='linear')
## fill nan by 0
combined_df.fillna(0, inplace=True)
combined_df = combined_df.loc[:, (combined_df != 0).any(axis=0)]

combined_df.to_csv(os.path.join(
    os.pardir, "output", 'train.csv'), index=False)


In [6]:
temp_df=combined_df.copy()
temp_df.drop(['timestamp'], axis=1, inplace=True)
# minus by mean
temp_df = temp_df - temp_df.mean()
blip_df=pd.DataFrame()
# chaos_state_df = pd.read_csv(os.path.join(
#     os.pardir, "output", 'chaos_state.csv'))
# blip_df["chaos_state"]=chaos_state_df["chaos_state"]

# discritize each single column by kmeans cluster
for col in temp_df.columns:
    # if col=="chaos__event_type":
    #     blip_df[col]=temp_df[col]
    #     continue
    kmeans = KMeans(n_clusters=15, random_state=0)
    kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
    blip_df[col]=kmeans.labels_

blip_df.to_csv(os.path.join(os.pardir, "output", 'train_blip.csv'), index=False)

  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  blip_df[col]=kmeans.labels_
  blip_df[col]=kmeans.labels_
  kmeans.fit(temp_df[col].to_numpy().reshape(-1, 1))
  blip_df[col]=kmeans.labels_
  blip_df

In [7]:
# copy combined_df
notears_raw_df = combined_df.copy()
# standard normalization
notears_raw_df = (notears_raw_df - notears_raw_df.mean()) / \
    (notears_raw_df.std())
notears_raw_df.dropna(axis=1, how='all', inplace=True)
notears_raw_df.to_csv(os.path.join(os.pardir, "output",
                      'notears_raw.csv'), index=False)

notears_df = notears_raw_df.drop(['timestamp'], axis=1)
notears_df.to_csv(os.path.join(os.pardir, "output",
                  'notears.csv'), index=False, header=False)


In [None]:
data_np=np.loadtxt(os.path.join(os.pardir, "output", 'notears.csv'), delimiter=',')

In [None]:
data_np.shape

In [None]:
np.argwhere(np.isinf(data_np))

In [None]:
combined_df

In [None]:
np.where(pd.isnull(new_df))

In [5]:
df=pd.read_csv(os.path.join(os.pardir, "output", "noChaos.csv"))

In [6]:
df.shape

(75600, 161)

In [None]:
df.plot(x=df.columns[0], y=df.columns[1:])