In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [72]:
BASE_PATH = '../benchmark/metrics'
percentile = '90th'

schedule = pd.read_csv('%s/schedule.csv' % BASE_PATH, index_col=0)
params = schedule.columns
indexes = schedule.index
# scaling scheule
scaler = MinMaxScaler()
schedule = pd.DataFrame(scaler.fit_transform(schedule), columns=params, index=indexes)

latency = pd.read_csv('%s/processed/latency.csv' % BASE_PATH, index_col=0)
states = pd.read_csv('%s/processed/states_%s.csv' % (BASE_PATH, percentile), index_col=0)
states_cols = pd.read_csv('../benchmark/meta/state_meta.csv')['name']
states = states[states_cols]

In [73]:
%%script falses
df = pd.concat([latency, states], axis=1)
fig, ax = plt.subplots(figsize=(15, 10))  
ans = sns.heatmap(df.corr(), linewidths=.5, xticklabels=df.columns, yticklabels=df.columns, ax=ax)

Couldn't find program: 'falses'


In [74]:
from sklearn.model_selection import train_test_split

# spliting training and testing sets
X = schedule
y = pd.concat([states, latency], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
states_y_train = y_train[states.columns]
states_y_test = y_test[states.columns]
latency_y_train = y_train[latency.columns]
latency_y_test = y_test[latency.columns]

In [75]:
# Construct env prediction model using RF or GB
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, median_absolute_error, mean_squared_log_error, mean_absolute_error, explained_variance_score, r2_score
import numpy as np
from sklearn.externals import joblib

# build regression model for every column
def train(my_X, my_y, cols):
    for i, col in enumerate(cols):
        y = my_y[col].tolist()

        # evaluate the model and collect the scores
        best_model = None
        best_score = -1e6

        # choose the best model from RF and GB
        rf = RandomForestRegressor(random_state=1000)
        # gb = GradientBoostingRegressor(random_state=1000)
        models = [rf]
        for model in models:
            # define the evaluation procedure
            cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=43)
            n_scores = cross_val_score(model, my_X, y, scoring='r2', cv=cv, n_jobs=-1)
            if np.mean(n_scores) > best_score:
                best_model = model
                best_score = np.mean(n_scores)
                model.fit(X=my_X, y=y)
                joblib.dump(best_model, './models/%s.joblib' % col)
            print('%s: %.3f (%.3f)' % (col, np.mean(n_scores), np.std(n_scores)))

In [76]:
# accurancy of testing set
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import ceil

def test(my_X, my_y, cols):
    for col in cols:
        model = joblib.load('./models/%s.joblib' % col)
        score = model.score(my_X, my_y[col])
        print('Test %s: %.3f' % (col, score))

In [77]:
train(X_train, states_y_train, states.columns)

network_request_FetchConsumer_TotalTimeMs: 0.705 (0.033)
network_request_Fetch_MessageConversionsTimeMs: 0.704 (0.033)
network_request_Produce_TotalTimeMs: 0.713 (0.063)
network_socket_FetchConsumer_NetworkProcessorAvgIdlePercent: 0.981 (0.008)
network_socket_FetchConsumer_RequestHandlerAvgIdlePercent: 0.953 (0.032)
os_open_fd_count: 0.999 (0.000)
os_process_cpu_time: 0.848 (0.014)
server_broker_topics_TotalProduceRequestsPerSec: 0.853 (0.035)
server_broker_topics_AllTopicsBytesIn: 0.853 (0.014)
server_broker_topics_AllTopicsBytesOut: 0.712 (0.035)
threading_thread_count: 0.996 (0.001)


In [78]:
test(X_test, states_y_test, states.columns)

Test network_request_FetchConsumer_TotalTimeMs: 0.717
Test network_request_Fetch_MessageConversionsTimeMs: 0.717
Test network_request_Produce_TotalTimeMs: 0.700
Test network_socket_FetchConsumer_NetworkProcessorAvgIdlePercent: 0.982
Test network_socket_FetchConsumer_RequestHandlerAvgIdlePercent: 0.974
Test os_open_fd_count: 0.999
Test os_process_cpu_time: 0.851
Test server_broker_topics_TotalProduceRequestsPerSec: 0.848
Test server_broker_topics_AllTopicsBytesIn: 0.846
Test server_broker_topics_AllTopicsBytesOut: 0.724
Test threading_thread_count: 0.996


In [79]:
train(X_train, latency_y_train, latency.columns)

min: 0.914 (0.011)
max: 0.945 (0.006)
median: 0.996 (0.000)
mean: 0.996 (0.000)
std: 0.992 (0.001)
25th: 0.986 (0.002)
50th: 0.996 (0.000)
75th: 0.998 (0.000)
90th: 0.998 (0.000)
95th: 0.998 (0.000)
99th: 0.984 (0.002)


In [80]:
test(X_test, latency_y_test, latency.columns)

Test min: 0.907
Test max: 0.946
Test median: 0.996
Test mean: 0.996
Test std: 0.993
Test 25th: 0.986
Test 50th: 0.996
Test 75th: 0.998
Test 90th: 0.999
Test 95th: 0.998
Test 99th: 0.984
