In [1]:
import os
import pandas as pd
import numpy as np
import time
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### Load data ###

In [2]:
cpd = {}

for dataset in ["CPU", "Memory", "Disk"]:

    results_dir = "./%s_results" % dataset
    if dataset not in cpd.keys():
        cpd[dataset] = {} 

    for f in os.listdir("%s" % results_dir):
        # Exclude hidden files (most likely thumbnails)
        if f[:2] != "._":
            print("Loading: %s/%s" % (results_dir, f))
            df = pd.read_csv("%s/%s" % (results_dir, f))
            print(len(df))

            if "aggr" in f:
                # Extract CPD threshold from the filename 
                lthreshold = float(f.split("-")[1])

                df["lthreshold"] = lthreshold
                if "aggr" not in cpd[dataset].keys():
                    cpd[dataset]["aggr"] = pd.DataFrame()
                cpd[dataset]["aggr"] = pd.concat([cpd[dataset]["aggr"], df], sort=False)

            if "indiv" in f:
                if "indiv" not in cpd[dataset].keys():
                    cpd[dataset]["indiv"] = pd.DataFrame()
                cpd[dataset]["indiv"] = pd.concat([cpd[dataset]["indiv"], df], sort=False)

            if "meta" in f:
                cpd[dataset]["meta"] = df

            if "segment-durations" in f:
                cpd[dataset]["segment-durations"] = df

Loading: ./CPU_results/indiv-0.70
283
Loading: ./CPU_results/aggr-1.00
4034
Loading: ./CPU_results/indiv-0.40
147
Loading: ./CPU_results/aggr-0.30
354
Loading: ./CPU_results/aggr-0.90
3951
Loading: ./CPU_results/aggr-0.70
3068
Loading: ./CPU_results/indiv-1.00
410
Loading: ./CPU_results/aggr-0.40
930
Loading: ./CPU_results/meta
11
Loading: ./CPU_results/indiv-0.30
49
Loading: ./CPU_results/indiv-0.90
363
Loading: ./CPU_results/indiv-0.80
325
Loading: ./CPU_results/aggr-0.50
1944
Loading: ./CPU_results/segment-durations
6725
Loading: ./CPU_results/aggr-0.60
2459
Loading: ./CPU_results/aggr-0.80
3565
Loading: ./CPU_results/indiv-0.50
193
Loading: ./CPU_results/indiv-0.60
235
Loading: ./Memory_results/indiv-0.70
1406
Loading: ./Memory_results/aggr-1.00
6241
Loading: ./Memory_results/indiv-0.40
925
Loading: ./Memory_results/aggr-0.30
2656
Loading: ./Memory_results/aggr-0.90
6002
Loading: ./Memory_results/aggr-0.70
5420
Loading: ./Memory_results/indiv-1.00
1784
Loading: ./Memory_results/agg

In [3]:
count_df = pd.DataFrame(columns=["CPU: CP #", "Mem: CP #", "Disk: CP #"])
for lthreshold in cpd["CPU"]["indiv"]["lthreshold"].sort_values().unique():
    cpu_cpd_c = len(cpd["CPU"]["indiv"][(cpd["CPU"]["indiv"]["lthreshold"] == lthreshold)])
    
    mem_cpd_c = len(cpd["Memory"]["indiv"][(cpd["Memory"]["indiv"]["lthreshold"] == lthreshold)])
    
    disk_cpd_c = len(cpd["Disk"]["indiv"][(cpd["Disk"]["indiv"]["lthreshold"] == lthreshold)])
    
    count_df.loc[lthreshold] = (cpu_cpd_c, mem_cpd_c, disk_cpd_c)
    
count_df["Total #"] = count_df.sum(1)
count_df.loc['Total',:] = count_df.sum(0)
display(count_df)

Unnamed: 0,CPU: CP #,Mem: CP #,Disk: CP #,Total #
0.3,49,492,42,583.0
0.4,147,925,76,1148.0
0.5,193,1113,108,1414.0
0.6,235,1263,134,1632.0
0.7,283,1406,160,1849.0
0.8,325,1529,191,2045.0
0.9,363,1630,215,2208.0
1.0,410,1784,245,2439.0
Total,2005,10142,1171,13318.0


In [4]:
memory = cpd['Memory']['indiv']
memory.columns

Index(['hw_type', 'lthreshold', 'testname', 'dvfs', 'socket_num', 'timestamp',
       'percent_change'],
      dtype='object')

In [5]:
cpu = cpd['CPU']['indiv']
cpu.columns

Index(['hw_type', 'lthreshold', 'testname', 'total_threads', 'dvfs',
       'socket_num', 'timestamp', 'percent_change'],
      dtype='object')

In [6]:
disk = cpd['Disk']['indiv']
disk.columns

Index(['hw_type', 'lthreshold', 'testname', 'device', 'iodepth', 'timestamp',
       'percent_change'],
      dtype='object')

### XgBoost_memory ###

In [7]:
import xgboost as xgb

  data = yaml.load(f.read()) or {}
  defaults = yaml.load(f)


In [37]:
X = memory[['hw_type','lthreshold','testname','dvfs','socket_num','percent_change']]
y = memory[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 5, subsample = 0.9, n_estimators = 1000)
xg_reg.fit(X_train,y_train)
predict = xg_reg.predict(X_test)



In [41]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(memory): %f" %(rmse/24/3600),"days")

RMSE(memory): 100.925054 days


### XgBoost_cpu ###

In [11]:
X = cpu[['hw_type','lthreshold','testname','total_threads','dvfs','socket_num','percent_change']]
y = cpu[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 5, subsample = 0.9, n_estimators = 1000)
xg_reg.fit(X_train,y_train)
predict = xg_reg.predict(X_test)



In [13]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(cpu): %f" %(rmse/24/3600),"days")

RMSE(cpu): 59.408048 days


### XgBoost_disk ###

In [14]:
X = disk[['hw_type', 'lthreshold', 'testname', 'device', 'iodepth', 'timestamp', 'percent_change']]
y = disk[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 5, subsample = 0.9, n_estimators = 1000)
xg_reg.fit(X_train,y_train)
predict = xg_reg.predict(X_test)



In [16]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(disk): %f" %(rmse/24/3600),"days")

RMSE(disk): 4.283519 days


### LightGBM_memory###

In [17]:
import lightgbm as lgb

In [18]:
X = memory[['hw_type','lthreshold','testname','dvfs','socket_num','percent_change']]
y = memory[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
lgb_reg = lgb.LGBMRegressor(objective = 'mse', colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 5, subsample = 0.9, n_estimators = 1000, random_state = 3)
lgb_reg.fit(X_train,y_train)
predict = lgb_reg.predict(X_test)



In [20]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(memory): %f" %(rmse/24/3600),"days")

RMSE(memory): 98.493888 days


### LightGBM_cpu###

In [21]:
X = cpu[['hw_type','lthreshold','testname','total_threads','dvfs','socket_num','percent_change']]
y = cpu[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
lgb_reg = lgb.LGBMRegressor(objective = 'mse', colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 5, subsample = 0.9, n_estimators = 1000, random_state = 3)
lgb_reg.fit(X_train,y_train)
predict = lgb_reg.predict(X_test)



In [23]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(cpu): %f" %(rmse/24/3600),"days")

RMSE(cpu): 53.941582 days


### LightGBM_disk###

In [24]:
X = disk[['hw_type', 'lthreshold', 'testname', 'device', 'iodepth', 'timestamp', 'percent_change']]
y = disk[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
lgb_reg = lgb.LGBMRegressor(objective = 'mse', colsample_bytree = 0.7, learning_rate = 0.1,
                max_depth = 5, subsample = 0.9, n_estimators = 1000, random_state = 3)
lgb_reg.fit(X_train,y_train)
predict = lgb_reg.predict(X_test)



In [26]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(disk): %f" %(rmse/24/3600),"days")

RMSE(disk): 7.773893 days


### CatBoost_memory###

In [27]:
import catboost as cb

In [28]:
X = memory[['hw_type','lthreshold','testname','dvfs','socket_num','percent_change']]
y = memory[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
cb_reg = cb.CatBoostRegressor(learning_rate = 0.1, max_depth = 5, verbose = 100, iterations = 1000, 
                              early_stopping_rounds = 200, eval_metric = 'RMSE')
cb_reg.fit(X_train,y_train)
predict = cb_reg.predict(X_test)

0:	learn: 18654443.5418442	total: 77.8ms	remaining: 1m 17s
100:	learn: 12528923.6391739	total: 285ms	remaining: 2.54s
200:	learn: 11229380.0765889	total: 442ms	remaining: 1.76s
300:	learn: 10396024.6114900	total: 619ms	remaining: 1.44s
400:	learn: 9812969.8800908	total: 772ms	remaining: 1.15s
500:	learn: 9306167.0987501	total: 933ms	remaining: 929ms
600:	learn: 8867258.9772308	total: 1.09s	remaining: 721ms
700:	learn: 8512805.0717410	total: 1.25s	remaining: 532ms
800:	learn: 8233149.6106941	total: 1.4s	remaining: 347ms
900:	learn: 7948256.7242913	total: 1.56s	remaining: 171ms
999:	learn: 7686873.3878281	total: 1.71s	remaining: 0us


In [30]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(memory): %f" %(rmse/24/3600),"days")

RMSE(memory): 99.801512 days


### CatBoost_cpu###

In [31]:
X = cpu[['hw_type','lthreshold','testname','total_threads','dvfs','socket_num','percent_change']]
y = cpu[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
cb_reg = cb.CatBoostRegressor(learning_rate = 0.1, max_depth = 5, verbose = 100, iterations = 1000, 
                              early_stopping_rounds = 200, eval_metric = 'RMSE')
cb_reg.fit(X_train,y_train)
predict = cb_reg.predict(X_test)

0:	learn: 11898006.4670571	total: 1.99ms	remaining: 1.98s
100:	learn: 5464737.7179244	total: 175ms	remaining: 1.55s
200:	learn: 4129122.6773024	total: 279ms	remaining: 1.11s
300:	learn: 3534898.9415882	total: 362ms	remaining: 841ms
400:	learn: 3169241.5822830	total: 448ms	remaining: 669ms
500:	learn: 2915052.1737593	total: 541ms	remaining: 539ms
600:	learn: 2692865.9791104	total: 633ms	remaining: 420ms
700:	learn: 2489288.8564780	total: 716ms	remaining: 305ms
800:	learn: 2303380.3940375	total: 811ms	remaining: 202ms
900:	learn: 2164901.4691374	total: 897ms	remaining: 98.6ms
999:	learn: 2045935.4246923	total: 982ms	remaining: 0us


In [33]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(cpu): %f" %(rmse/24/3600),"days")

RMSE(cpu): 49.244405 days


### CatBoost_disk###

In [34]:
X = disk[['hw_type', 'lthreshold', 'testname', 'device', 'iodepth', 'timestamp', 'percent_change']]
y = disk[['timestamp']]

categorical_cols = X.columns[X.dtypes==object].tolist()

le = LabelEncoder()

X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)
cb_reg = cb.CatBoostRegressor(learning_rate = 0.1, max_depth = 5, verbose = 100, iterations = 1000, 
                              early_stopping_rounds = 200, eval_metric = 'RMSE')
cb_reg.fit(X_train,y_train)
predict = cb_reg.predict(X_test)

0:	learn: 16887612.6939762	total: 1.43ms	remaining: 1.43s
100:	learn: 623945.7173379	total: 146ms	remaining: 1.3s
200:	learn: 375831.2418350	total: 225ms	remaining: 893ms
300:	learn: 271868.3343212	total: 295ms	remaining: 686ms
400:	learn: 213191.5840919	total: 362ms	remaining: 541ms
500:	learn: 173027.4210201	total: 442ms	remaining: 440ms
600:	learn: 144438.7978839	total: 512ms	remaining: 340ms
700:	learn: 125311.9291876	total: 579ms	remaining: 247ms
800:	learn: 110058.6426640	total: 664ms	remaining: 165ms
900:	learn: 99312.4836234	total: 730ms	remaining: 80.3ms
999:	learn: 88394.6169232	total: 796ms	remaining: 0us


In [36]:
rmse = np.sqrt(mean_squared_error(y_test, predict))
print("RMSE(disk): %f" %(rmse/24/3600),"days")

RMSE(disk): 4.022586 days
