In [1]:
import os
import pandas as pd
import numpy as np
import time
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


### Load data ###

In [2]:
cpd = {}

for dataset in ["CPU", "Memory", "Disk"]:

    results_dir = "./%s_results" % dataset
    if dataset not in cpd.keys():
        cpd[dataset] = {} 

    for f in os.listdir("%s" % results_dir):
        # Exclude hidden files (most likely thumbnails)
        if f[:2] != "._":
            print("Loading: %s/%s" % (results_dir, f))
            df = pd.read_csv("%s/%s" % (results_dir, f))
            print(len(df))

            if "aggr" in f:
                # Extract CPD threshold from the filename 
                lthreshold = float(f.split("-")[1])

                df["lthreshold"] = lthreshold
                if "aggr" not in cpd[dataset].keys():
                    cpd[dataset]["aggr"] = pd.DataFrame()
                cpd[dataset]["aggr"] = pd.concat([cpd[dataset]["aggr"], df], sort=False)

            if "indiv" in f:
                if "indiv" not in cpd[dataset].keys():
                    cpd[dataset]["indiv"] = pd.DataFrame()
                cpd[dataset]["indiv"] = pd.concat([cpd[dataset]["indiv"], df], sort=False)

            if "meta" in f:
                cpd[dataset]["meta"] = df

            if "segment-durations" in f:
                cpd[dataset]["segment-durations"] = df

Loading: ./CPU_results/indiv-0.70
283
Loading: ./CPU_results/aggr-1.00
4034
Loading: ./CPU_results/indiv-0.40
147
Loading: ./CPU_results/aggr-0.30
354
Loading: ./CPU_results/aggr-0.90
3951
Loading: ./CPU_results/aggr-0.70
3068
Loading: ./CPU_results/indiv-1.00
410
Loading: ./CPU_results/aggr-0.40
930
Loading: ./CPU_results/meta
11
Loading: ./CPU_results/indiv-0.30
49
Loading: ./CPU_results/indiv-0.90
363
Loading: ./CPU_results/indiv-0.80
325
Loading: ./CPU_results/aggr-0.50
1944
Loading: ./CPU_results/segment-durations
6725
Loading: ./CPU_results/aggr-0.60
2459
Loading: ./CPU_results/aggr-0.80
3565
Loading: ./CPU_results/indiv-0.50
193
Loading: ./CPU_results/indiv-0.60
235
Loading: ./Memory_results/indiv-0.70
1406
Loading: ./Memory_results/aggr-1.00
6241
Loading: ./Memory_results/indiv-0.40
925
Loading: ./Memory_results/aggr-0.30
2656
Loading: ./Memory_results/aggr-0.90
6002
Loading: ./Memory_results/aggr-0.70
5420
Loading: ./Memory_results/indiv-1.00
1784
Loading: ./Memory_results/agg

In [3]:
count_df = pd.DataFrame(columns=["CPU: CP #", "Mem: CP #", "Disk: CP #"])
for lthreshold in cpd["CPU"]["indiv"]["lthreshold"].sort_values().unique():
    cpu_cpd_c = len(cpd["CPU"]["indiv"][(cpd["CPU"]["indiv"]["lthreshold"] == lthreshold)])
    
    mem_cpd_c = len(cpd["Memory"]["indiv"][(cpd["Memory"]["indiv"]["lthreshold"] == lthreshold)])
    
    disk_cpd_c = len(cpd["Disk"]["indiv"][(cpd["Disk"]["indiv"]["lthreshold"] == lthreshold)])
    
    count_df.loc[lthreshold] = (cpu_cpd_c, mem_cpd_c, disk_cpd_c)
    
count_df["Total #"] = count_df.sum(1)
count_df.loc['Total',:] = count_df.sum(0)
display(count_df)

Unnamed: 0,CPU: CP #,Mem: CP #,Disk: CP #,Total #
0.3,49,492,42,583.0
0.4,147,925,76,1148.0
0.5,193,1113,108,1414.0
0.6,235,1263,134,1632.0
0.7,283,1406,160,1849.0
0.8,325,1529,191,2045.0
0.9,363,1630,215,2208.0
1.0,410,1784,245,2439.0
Total,2005,10142,1171,13318.0


In [4]:
memory = cpd['Memory']['indiv']
memory.columns

Index(['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'timestamp',
       'percent_change'],
      dtype='object')

In [5]:
cpu = cpd['CPU']['indiv']
cpu.columns

Index(['hw_type', 'lthreshold', 'testname', 'total_threads', 'dvfs',
       'socket_num', 'timestamp', 'percent_change'],
      dtype='object')

In [6]:
disk = cpd['Disk']['indiv']
disk.columns

Index(['hw_type', 'lthreshold', 'testname', 'device', 'iodepth', 'timestamp',
       'percent_change'],
      dtype='object')

In [7]:
class Data:
    X = pd.DataFrame()
    y = pd.DataFrame()
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def xgb(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=8)
        xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.7, learning_rate = 0.1,
                                  max_depth = 5, subsample = 0.9, n_estimators = 1000)
        xg_reg.fit(X_train,y_train)
        predict_xgb = xg_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predict_xgb))
        print("RMSE(XgBoost): %f" %(rmse/24/3600),"days")
    def lgb(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=8)
        lgb_reg = lgb.LGBMRegressor(objective = 'mse', colsample_bytree = 0.7, learning_rate = 0.1,
                                    max_depth = 5, subsample = 0.9, n_estimators = 1000, random_state = 3, num_leaves = 30)
        lgb_reg.fit(X_train,y_train)
        predict_lgb = lgb_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predict_lgb))
        print("RMSE(LightGBM): %f" %(rmse/24/3600),"days")
    def cb(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=8)
        cb_reg = cb.CatBoostRegressor(learning_rate = 0.1, max_depth = 5, verbose = 0, iterations = 1000,
                                      early_stopping_rounds = 200, eval_metric = 'RMSE')
        cb_reg.fit(X_train,y_train)
        predict_cb = cb_reg.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, predict_cb))
        print("RMSE(CatBoost): %f" %(rmse/24/3600),"days")

### Memory ###

In [8]:
X = memory[['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'percent_change']]
y = memory[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
le = LabelEncoder()
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

Memory = Data(X,y)
Memory.xgb()
Memory.lgb()
Memory.cb()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


RMSE(XgBoost): 100.925054 days
RMSE(LightGBM): 98.805006 days
RMSE(CatBoost): 99.801512 days


### CPU###

In [9]:
X = cpu[['hw_type', 'lthreshold', 'testname', 'total_threads', 'dvfs', 'socket_num', 'percent_change']]
y = cpu[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

Cpu = Data(X,y)
Cpu.xgb()
Cpu.lgb()
Cpu.cb()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


RMSE(XgBoost): 59.408048 days
RMSE(LightGBM): 53.941582 days
RMSE(CatBoost): 49.244405 days


### Disk ###

In [10]:
X = disk[['hw_type', 'lthreshold', 'testname', 'device', 'iodepth', 'percent_change']]
y = disk[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

Disk = Data(X,y)
Disk.xgb()
Disk.lgb()
Disk.cb()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


RMSE(XgBoost): 122.986788 days
RMSE(LightGBM): 116.334132 days
RMSE(CatBoost): 97.401529 days


### Memory+CPU###

In [11]:
memory_cpu = pd.merge(memory,cpu,how = 'outer',on = ['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'timestamp', 'percent_change'])

In [12]:
memory_cpu

Unnamed: 0,hw_type,lthreshold,testname,dvfs,socket_num,timestamp,percent_change,total_threads
0,c220g1,0.7,add,no,0,1515202534,-3.772623,
1,c220g1,0.7,add,no,0,1540060856,9.972447,
2,c220g1,0.7,add,no,0,1543671468,-10.811279,
3,c220g1,0.7,add,no,1,1515202534,-3.804811,
4,c220g1,0.7,add,no,1,1540060985,-1.383751,
...,...,...,...,...,...,...,...,...
12142,xl170,0.6,SP,no,0,1572632379,-2.716910,20.0
12143,xl170,0.6,SP,yes,0,1572632379,-3.021414,20.0
12144,xl170,0.6,UA,yes,0,1555957133,-0.239028,1.0
12145,xl170,0.6,UA,yes,0,1554315514,-0.241847,20.0


In [13]:
X = memory_cpu[['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'total_threads', 'percent_change']]
y = memory_cpu[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

MC = Data(X,y)
MC.xgb()
MC.lgb()
MC.cb()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


RMSE(XgBoost): 103.078309 days
RMSE(LightGBM): 94.128242 days
RMSE(CatBoost): 103.060766 days


### Memory+Disk###

In [14]:
memory_disk = pd.merge(memory,disk,how = 'outer',on = ['hw_type', 'lthreshold', 'testname', 'timestamp', 'percent_change'])

In [15]:
memory_disk

Unnamed: 0,hw_type,lthreshold,testname,dvfs,socket_num,timestamp,percent_change,device,iodepth
0,c220g1,0.7,add,no,0.0,1515202534,-3.772623,,
1,c220g1,0.7,add,no,0.0,1540060856,9.972447,,
2,c220g1,0.7,add,no,0.0,1543671468,-10.811279,,
3,c220g1,0.7,add,no,1.0,1515202534,-3.804811,,
4,c220g1,0.7,add,no,1.0,1540060985,-1.383751,,
...,...,...,...,...,...,...,...,...,...
11308,xl170,0.6,read,,,1535869242,-0.864026,/dev/sda4,1.0
11309,xl170,0.6,read,,,1555957198,-11.289161,/dev/sda4,1.0
11310,xl170,0.6,read,,,1561854084,-3.503756,/dev/sda4,1.0
11311,xl170,0.6,write,,,1555957198,-12.762087,/dev/sda4,1.0


In [44]:
X = memory_disk[['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'percent_change', 'device', 'iodepth']]
y = memory_disk[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
X[categorical_cols] = X[categorical_cols].fillna("")
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

MD = Data(X,y)
MD.xgb()
MD.lgb()
MD.cb()

RMSE(XgBoost): 108.639575 days
RMSE(LightGBM): 101.973588 days
RMSE(CatBoost): 109.255086 days


### CPU+Disk###

In [45]:
cpu_disk = pd.merge(cpu,disk,how='outer',on = ['hw_type', 'lthreshold', 'testname', 'timestamp', 'percent_change'])

In [46]:
cpu_disk

Unnamed: 0,hw_type,lthreshold,testname,total_threads,dvfs,socket_num,timestamp,percent_change,device,iodepth
0,c220g1,0.7,BT,1.0,no,0.0,1543066686,0.214888,,
1,c220g1,0.7,BT,16.0,no,0.0,1569976740,1.825371,,
2,c220g1,0.7,CG,1.0,no,0.0,1543736380,4.477789,,
3,c220g1,0.7,CG,16.0,no,0.0,1565743135,0.962987,,
4,c220g1,0.7,CG,16.0,no,1.0,1565743135,0.337155,,
...,...,...,...,...,...,...,...,...,...,...
3171,xl170,0.6,read,,,,1535869242,-0.864026,/dev/sda4,1.0
3172,xl170,0.6,read,,,,1555957198,-11.289161,/dev/sda4,1.0
3173,xl170,0.6,read,,,,1561854084,-3.503756,/dev/sda4,1.0
3174,xl170,0.6,write,,,,1555957198,-12.762087,/dev/sda4,1.0


In [47]:
X = cpu_disk[['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'device', 'iodepth', 'total_threads', 'percent_change']]
y = cpu_disk[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
X[categorical_cols] = X[categorical_cols].fillna("")
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

CD = Data(X,y)
CD.xgb()
CD.lgb()
CD.cb()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


RMSE(XgBoost): 93.136902 days
RMSE(LightGBM): 94.127700 days
RMSE(CatBoost): 88.171939 days


### Memory+CPU+Disk###

In [69]:
memory_cpu_disk = pd.merge(cpu_disk,memory_cpu,how = 'outer',on = ['hw_type', 'lthreshold', 'testname', 'socket_num', 'dvfs', 'total_threads', 'timestamp', 'percent_change'])

In [70]:
X = memory_cpu_disk[['hw_type', 'lthreshold', 'testname', 'dvfs', 'device', 'socket_num', 'iodepth', 'total_threads', 'percent_change']]
y = memory_cpu_disk[['timestamp']]
categorical_cols = X.columns[X.dtypes==object].tolist()
X[categorical_cols] = X[categorical_cols].fillna("")
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))
MCD = Data(X,y)
MCD.xgb()
MCD.lgb()
MCD.cb()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


RMSE(XgBoost): 106.988454 days
RMSE(LightGBM): 105.651015 days
RMSE(CatBoost): 111.903522 days
