# Changepoint Detection for the OSDI 2018 paper "Taming Performance Variability"#

In [1]:
import csv
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import datetime as dt
import os

In [3]:
import cpd

In [4]:
dict = {}

source_dir = './cloudlab-benchmarks-osdi18-data-and-nodebooks/data/raw-data/'
for f in os.listdir(source_dir):
    if ".csv" in f:
        dict[os.path.splitext(f)[0]] = pd.read_csv(source_dir + f)

In [5]:
# dict['disk_info'].T

In [6]:
# dict['disk_results'].T

In [7]:
# dict['env_info'].T

In [8]:
# dict['fio_info'].T

In [9]:
# dict['iperf3_info'].T

In [10]:
# dict['iperf3_results'].T

In [11]:
# dict['mem_results'].T

In [12]:
# dict['membench_info'].T

In [13]:
# dict['network_info'].T

In [14]:
# dict['ping_info'].T

In [15]:
# dict['ping_results'].T

In [16]:
# dict['stream_info'].T

In [17]:
raw_disk = pd.merge(dict['disk_results'], dict['env_info'], 
                on=['run_uuid','nodeid','nodeuuid','timestamp'])
raw_disk['disk_name'] = raw_disk['device'].apply(lambda x: x.rstrip(string.digits) if 'nvm' not in x else x[:-2])
raw_disk['disk_name'].unique()
raw_disk = pd.merge(raw_disk, dict['disk_info'], 
                on=['run_uuid','timestamp','nodeuuid','disk_name','nodeid'])
raw_disk['disk_size'] = raw_disk['disk_size'].apply(lambda x: x.lstrip())

In [18]:
# raw_disk.T

In [19]:
exclude_ids = []
for idx, grp in raw_disk.groupby(["site", "hw_type", "device", "disk_type", "disk_model", "disk_size"]):
#     print(idx, len(grp))
    if len(grp)< 200:
        exclude_ids.extend(grp.index.values)
disk = raw_disk.drop(exclude_ids, inplace=False)
# Exclude measurements on or after April 2, 2018
disk = disk[disk["timestamp"] <= 1522636071]
disk = disk[disk["run_success"] != 0]

len(disk)

139864

In [20]:
# print(disk.hw_type.value_counts())

In [21]:
df = disk
factor_list = ["testname","device","iodepth"]
metric = "mean"

dest_dir = "./disk_cpd"
if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

lthreshold_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

meta_df = pd.DataFrame(columns=["hw_type", "total_measurements", "configurations", 
                                    "start_timestamp", "end_timestamp"])
indiv_cps = pd.DataFrame(columns = ["hw_type", "lthreshold"] + 
                  factor_list +  
                  ["timestamp", "percent_change"])
indiv_durations = pd.DataFrame(columns = ["hw_type", "lthreshold"] + 
                                     factor_list +  
                                     ["segment_duration"])


cpd.cpd_aggr(df, factor_list, metric, meta_df, lthreshold_list, indiv_cps, indiv_durations,dest_dir)


Processing hw_type: c220g1
Processed configs: 192
Processed measurements: 19000

Processing hw_type: c220g2
Processed configs: 384
Processed measurements: 55560

Processing hw_type: c6320
Processed configs: 512
Processed measurements: 67416

Processing hw_type: c8220
Processed configs: 640
Processed measurements: 95288

Processing hw_type: m400
Processed configs: 704
Processed measurements: 123816

Processing hw_type: m510
Processed configs: 768
Processed measurements: 139864


In [22]:
cpd = {}
cpd["disk"] = {}
result_dir = "disk_cpd"
for f in os.listdir(result_dir):
    if f[:2] != "._":
        print("Loading:%s/%s" %(result_dir,f))
        df = pd.read_csv("%s/%s" %(result_dir,f))
        print(len(df))
        
        if "aggr" in f:
            lthreshold = float(f.split("-")[1])
            df["lthreshold"] = lthreshold
            if "aggr" not in cpd["disk"].keys():
                cpd["disk"]["aggr"] = pd.DataFrame()
            cpd["disk"]["aggr"] = pd.concat([cpd["disk"]["aggr"], df], sort=False)
            
        if "indiv" in f:
            if "indiv" not in cpd["disk"].keys():
                cpd["disk"]["indiv"] = pd.DataFrame()
                cpd["disk"]["indiv"] = pd.concat([cpd["disk"]["indiv"], df], sort=False)

        if "meta" in f:
            cpd["disk"]["meta"] = df
            
        if "segment-durations" in f:
            cpd["disk"]["segment-durations"] = df

Loading:disk_cpd/indiv-0.70
25
Loading:disk_cpd/aggr-1.00
1160
Loading:disk_cpd/indiv-0.40
5
Loading:disk_cpd/aggr-0.30
150
Loading:disk_cpd/aggr-0.90
986
Loading:disk_cpd/aggr-0.70
289
Loading:disk_cpd/indiv-1.00
76
Loading:disk_cpd/aggr-0.40
80
Loading:disk_cpd/meta
6
Loading:disk_cpd/indiv-0.30
18
Loading:disk_cpd/indiv-0.90
57
Loading:disk_cpd/indiv-0.80
39
Loading:disk_cpd/aggr-0.50
112
Loading:disk_cpd/segment-durations
995
Loading:disk_cpd/aggr-0.60
235
Loading:disk_cpd/aggr-0.80
759
Loading:disk_cpd/indiv-0.50
10
Loading:disk_cpd/indiv-0.60
15


In [23]:
cpd["disk"]["segment-durations"].lthreshold.value_counts()

1.0    172
0.9    153
0.8    135
0.7    121
0.6    111
0.5    106
0.4    101
0.3     96
Name: lthreshold, dtype: int64

In [24]:
cpd["disk"]["aggr"].hw_type.value_counts()

c220g2    1208
c220g1     720
c8220      720
m510       521
c6320      370
m400       232
Name: hw_type, dtype: int64

In [25]:
cpd["disk"]["meta"]

Unnamed: 0,hw_type,total_measurements,configurations,start_timestamp,end_timestamp
0,c220g1,19000,24,1495272056,1522621977
1,c220g2,36560,24,1495272065,1522622172
2,c6320,11856,16,1495272018,1522565352
3,c8220,27872,16,1495457110,1522630140
4,m400,28528,8,1495271912,1522627922
5,m510,16048,8,1495476412,1522627939


In [26]:
count_df = pd.DataFrame(columns=[ "Disk: CP #", 
                                 "Disk: Conf #", 
                                 "Disk: Per Conf"])
for lthreshold in cpd["disk"]["indiv"]["lthreshold"].sort_values().unique():

  disk_cpd_c = len(cpd["disk"]["indiv"][(cpd["disk"]["indiv"]["lthreshold"] == lthreshold)])

  disk_conf_c = cpd["disk"]["meta"].configurations.sum() * 1.0

  disk_r = disk_cpd_c/disk_conf_c

  count_df.loc[lthreshold] = (disk_cpd_c, 
                              disk_conf_c,
                              disk_r)


display(count_df)

Unnamed: 0,Disk: CP #,Disk: Conf #,Disk: Per Conf
0.7,25.0,96.0,0.260417


In [27]:
counts = count_df[["Disk: CP #"]].copy()
counts["Total #"] = counts.sum(axis=1)

print("This is how many changepoints we have found for different values of lthreshold:")
display(counts)

This is how many changepoints we have found for different values of lthreshold:


Unnamed: 0,Disk: CP #,Total #
0.7,25.0,25.0
