In [None]:
import pandas as pd
import numpy as np
import os, sys
import torch
import json
sys.path.append('../')
from models.utils import rocksdb_knobs_make_dict

In [None]:
# Data locations (relative to this notebook) and the number of workloads to load.
PATH_internal, PATH_external, PATH_knobs = (
    "../data/internal",
    "../data/external",
    "../data/rocksdb_conf",
)
wk_len = 16

In [None]:
# Parse the knob configurations under PATH_knobs and expose them as a float32
# DataFrame whose columns are the labels returned by the parser.
knob_table = rocksdb_knobs_make_dict(PATH_knobs)
knobs = pd.DataFrame(
    data=knob_table['data'].astype(np.float32),
    columns=knob_table['columnlabels'],
)
columns = knobs.columns
knobs.head(4)

In [None]:
# Preview the first 128 knob configurations.
knobs.head(128)

In [None]:
# Number of knob configurations (rows).
knobs.shape[0]

In [None]:
# Raw ndarray view of the knob table.
knobs.values

In [None]:
wk_len = 16

# Only the columns present in the pruned-ensemble CSV are kept for every workload.
pruned_im = pd.read_csv(os.path.join(PATH_internal, 'internal_ensemble_pruned_tmp.csv'), index_col=0)
kept_columns = pruned_im.columns

# workload index -> internal-metric DataFrame restricted to the pruned columns
internal_dict = {
    wk: pd.read_csv(os.path.join(PATH_internal, f'internal_results_{wk}.csv'), index_col=0)[kept_columns]
    for wk in range(wk_len)
}
internal_dict[0].head(4)

In [None]:
# workload index -> external-metric DataFrame (all columns kept)
external_dict = {
    wk: pd.read_csv(os.path.join(PATH_external, f'external_results_{wk}.csv'), index_col=0)
    for wk in range(wk_len)
}
external_dict[0].head(4)

## Train/Test Split
- train: 16000, test: 4000
- target: 0th workload

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import Dataset, DataLoader
from models.network import RocksDBDataset, SingleNet#, EncoderRNN, DecoderRNN
from models.train import train, valid#, trainRNN, validRNN

In [None]:
# One-hot encoded knobs plus the internal/external metrics of workload 2
# (marked "similar datasets" by the original author).
knobOneHot = np.load('../data/knobsOneHot.npy')
internal_m, external_m = internal_dict[2], external_dict[2]
tuple(arr.shape for arr in (knobOneHot, internal_m, external_m))

In [None]:
# One 80/20 split (fixed seed) applied to all four aligned tables so that
# rows stay matched across one-hot knobs, internal metrics, external metrics and raw knobs.
split_parts = train_test_split(knobOneHot, internal_m, external_m, knobs, test_size=0.2, random_state=24)
X_tr, X_te, Im_tr, Im_te, y_tr, y_te, knob_tr, knob_te = split_parts
tuple(part.shape for part in split_parts)

In [None]:
# First test-set row of the external metrics.
y_te.to_numpy()[0]

In [None]:
# (rows, pruned internal-metric columns) of the training split.
Im_tr.shape

### Pre-training

In [None]:
# All scalers are fit on the training split only.
scaler_X = MinMaxScaler().fit(X_tr)  # range: 0~1; fit but not applied below
scaler_knob = MinMaxScaler().fit(knob_tr)
scaler_Im = MinMaxScaler().fit(Im_tr)
scaler_y = StandardScaler().fit(y_tr)

# The one-hot inputs are moved to the GPU unscaled; only the targets are standardised.
# NOTE: X_tr / X_te are rebound from arrays to CUDA tensors here, so this cell
# is not safe to re-run without re-running the split cell first.
X_tr, X_te = (torch.Tensor(split).cuda() for split in (X_tr, X_te))
y_norm_tr, y_norm_te = (
    torch.Tensor(scaler_y.transform(split)).cuda() for split in (y_tr, y_te)
)

In [None]:
# Pre-training task: one-hot knobs -> min-max scaled internal metrics.
Im_norm_tr, Im_norm_te = (
    torch.Tensor(scaler_Im.transform(split)).cuda() for split in (Im_tr, Im_te)
)
Dataset_tr, Dataset_te = RocksDBDataset(X_tr, Im_norm_tr), RocksDBDataset(X_te, Im_norm_te)

# Shuffle only the training batches.
loader_tr = DataLoader(Dataset_tr, batch_size=32, shuffle=True)
loader_te = DataLoader(Dataset_te, batch_size=32, shuffle=False)

In [None]:
# Knob -> internal-metric network. The output width follows the number of
# pruned internal-metric columns instead of a hard-coded 148, so the model
# stays consistent with the target if the pruned column set changes.
k2i_model = SingleNet(
    input_dim=X_tr.shape[1],
    hidden_dim=1024,
    output_dim=Im_norm_tr.shape[1],
).cuda()

In [None]:
# Pre-train k2i_model on the training split only (no validation pass here).
lr, epochs = 0.001, 30
losses_tr = []

for epoch in range(epochs):
    loss_tr = train(k2i_model, loader_tr, lr)
    losses_tr.append(loss_tr)
    print(f"[{epoch:02d}/{epochs}] loss_tr: {loss_tr}")

### Train with knob2vec

In [None]:
# Use the transposed weight of the first layer in k2i_model.knob_fc as the
# knob2vec lookup table (presumably one row per one-hot input position — TODO confirm).
# lookup_table = np.load('LookupTable.npy')
first_layer = k2i_model.knob_fc[0]
lookup_table = first_layer.weight.T.detach().cpu().numpy()
lookup_table.shape

In [None]:
# np.save('LookupTable.npy', lookup_table)

In [None]:
def get_knob2vec(data, table, n_knobs=22):
    """Turn one-hot knob rows into stacked embedding vectors.

    Args:
        data: 2-D torch tensor; each row marks its active positions with 1.
            Rows are expected to contain ``n_knobs`` ones.
        table: 2-D array indexed by one-hot position along axis 0; row ``j``
            is the embedding of position ``j``.
        n_knobs: number of active positions per row. Defaults to 22, the
            value that was previously hard-coded.

    Returns:
        np.ndarray of shape ``(data.shape[0], n_knobs, table.shape[1])``.

    Raises:
        ValueError: if a row's looked-up block cannot be broadcast into
            ``(n_knobs, table.shape[1])``.
    """
    k2vec = np.zeros((data.shape[0], n_knobs, table.shape[1]))
    for i in range(data.shape[0]):
        # reshape(-1) keeps the index 1-D even when a row has a single active position.
        idx = (data[i] == 1).nonzero().reshape(-1).cpu().detach().numpy()
        # Use the `table` argument; the previous code read the global
        # `lookup_table` and silently ignored the parameter.
        k2vec[i] = table[idx]
    return k2vec

In [None]:
# Embed every configuration, then flatten each sample's per-knob vectors
# into a single feature vector.
K2vec_tr = torch.Tensor(get_knob2vec(X_tr, lookup_table)).cuda()
K2vec_tr = K2vec_tr.reshape(K2vec_tr.shape[0], -1)
K2vec_te = torch.Tensor(get_knob2vec(X_te, lookup_table)).cuda()
K2vec_te = K2vec_te.reshape(K2vec_te.shape[0], -1)

# Targets are the standardised external metrics.
Dataset_K2vec_tr, Dataset_K2vec_te = (
    RocksDBDataset(K2vec_tr, y_norm_tr),
    RocksDBDataset(K2vec_te, y_norm_te),
)

# The test loader is not shuffled, so predictions line up with y_te row order.
loader_K2vec_tr = DataLoader(Dataset_K2vec_tr, batch_size=32, shuffle=True)
loader_K2vec_te = DataLoader(Dataset_K2vec_te, batch_size=32, shuffle=False)

In [None]:
# (samples, flattened knob2vec features) after the reshape above.
K2vec_tr.shape

In [None]:
# knob2vec -> external-metric network. The output width is taken from the
# target tensor rather than a hard-coded 4, so it tracks the number of
# external-metric columns.
model = SingleNet(
    input_dim=K2vec_tr.shape[-1],
    hidden_dim=64,
    output_dim=y_norm_tr.shape[1],
).cuda()

In [None]:
# Train on knob2vec features and validate after every epoch.
# `outputs` keeps whatever valid() returned for the last epoch and is used
# by the evaluation cells below.
lr, epochs = 0.001, 30
losses_tr, losses_te = [], []
for epoch in range(epochs):
    loss_tr = train(model, loader_K2vec_tr, lr)
    loss_te, outputs = valid(model, loader_K2vec_te)
    losses_tr.append(loss_tr)
    losses_te.append(loss_te)
    print(f'[{epoch}/{epochs}] loss_tr: {loss_tr:.8f}\tloss_te:{loss_te:.8f}')

In [None]:
# Undo the target standardisation and round to 2 decimals for comparison
# against the untouched test-set external metrics.
outputs_np = outputs.cpu().detach().numpy()
pred = np.round(scaler_y.inverse_transform(outputs_np), 2)
true = y_te.to_numpy()

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Side-by-side look at the first ten predictions and their ground truth.
for row in range(10):
    print(f'predict rslt: {pred[row]}')
    print(f'ground truth: {true[row]}\n')

In [None]:
# Names of the external metrics, one per line.
for column_name in external_dict[0].columns:
    print(column_name)

In [None]:
# Per-metric R^2 (one value per external-metric column).
# Note that `pred` was rounded to 2 decimals before scoring.
score = r2_score(true, pred, multioutput='raw_values')
ex_col = external_dict[0].columns
for col_pos, c in enumerate(ex_col):
    print(f'{c:4} r2 score = {score[col_pos]:.4f}')

In [None]:
# Same per-metric R^2 values as the previous cell, shown as a raw array.
r2_score(true, pred, multioutput='raw_values')

### Train with knob2vec and GRU

In [None]:
# Sequence variant: keep the knob2vec output 3-D (no flattening).
# NOTE: this rebinds K2vec_tr / K2vec_te, the datasets and both loaders from
# the flattened knob2vec section above; here the test loader is shuffled too.
K2vec_tr, K2vec_te = (
    torch.Tensor(get_knob2vec(split, lookup_table)).cuda() for split in (X_tr, X_te)
)

Dataset_K2vec_tr, Dataset_K2vec_te = (
    RocksDBDataset(K2vec_tr, y_norm_tr),
    RocksDBDataset(K2vec_te, y_norm_te),
)

loader_K2vec_tr = DataLoader(Dataset_K2vec_tr, batch_size=32, shuffle=True)
loader_K2vec_te = DataLoader(Dataset_K2vec_te, batch_size=32, shuffle=True)

In [None]:
# Encoder/decoder pair sized by the knob2vec embedding width (last axis of K2vec_tr).
# NOTE(review): EncoderRNN and DecoderRNN are commented out of the
# `from models.network import ...` line above, so this cell raises NameError
# on a fresh kernel unless that import is restored.
encoder = EncoderRNN(hidden_dim=K2vec_tr.shape[-1]).cuda()
decoder = DecoderRNN(hidden_dim=K2vec_tr.shape[-1], output_dim=1).cuda()

In [None]:
# Train/validate the encoder-decoder pair for a fixed number of epochs.
# NOTE(review): trainRNN and validRNN are commented out of the
# `from models.train import ...` line above, so this cell raises NameError
# on a fresh kernel unless that import is restored.
# The print below formats loss_te with :.4f, so validRNN presumably returns
# a single number (unlike valid(), which is unpacked into two values earlier) — TODO confirm.
lr = 0.001
epochs = 30
losses_tr = []
losses_te = []
for epoch in range(epochs):
    loss_tr = trainRNN(encoder, decoder, loader_K2vec_tr, lr)
    loss_te = validRNN(encoder, decoder, loader_K2vec_te)
    
    losses_tr.append(loss_tr)
    losses_te.append(loss_te)
    
    print(f"[{epoch:02d}/{epochs}] loss_tr: {loss_tr}\tloss_te:{loss_te:.4f}")

### Train with raw knob

In [None]:
# Baseline: min-max scaled raw knob values as input, standardised external metrics as target.
knob_norm_tr, knob_norm_te = (
    torch.Tensor(scaler_knob.transform(split)).cuda() for split in (knob_tr, knob_te)
)

Dataset_knob_tr, Dataset_knob_te = (
    RocksDBDataset(knob_norm_tr, y_norm_tr),
    RocksDBDataset(knob_norm_te, y_norm_te),
)

# Both loaders shuffle, including the test loader.
loader_knob_tr = DataLoader(Dataset_knob_tr, batch_size=32, shuffle=True)
loader_knob_te = DataLoader(Dataset_knob_te, batch_size=32, shuffle=True)

In [None]:
# Raw-knob baseline network. The output width is taken from the target tensor
# rather than a hard-coded 4, so it tracks the number of external-metric columns.
knob_model = SingleNet(
    input_dim=knob_norm_tr.shape[1],
    hidden_dim=16,
    output_dim=y_norm_tr.shape[1],
).cuda()

In [None]:
# Train/validate the raw-knob baseline for a fixed number of epochs.
lr = 0.001
epochs = 30
losses_tr = []
losses_te = []
for epoch in range(epochs):
    loss_tr = train(knob_model, loader_knob_tr, lr)
    # valid() returns (loss, outputs), as the knob2vec training loop shows.
    # Binding the whole result to loss_te stored tuples in losses_te and made
    # the ':.4f' format below raise TypeError. The outputs get their own name
    # so the knob2vec `outputs` used for evaluation is not overwritten.
    loss_te, _knob_outputs = valid(knob_model, loader_knob_te)

    losses_tr.append(loss_tr)
    losses_te.append(loss_te)

    print(f"[{epoch:02d}/{epochs}] loss_tr: {loss_tr}\tloss_te:{loss_te:.4f}")

In [None]:
# Test targets mapped back to their original scale.
y_te_original_scale = scaler_y.inverse_transform(y_norm_te.cpu().detach().numpy())
pd.DataFrame(y_te_original_scale)

In [None]:
# knob2vec model predictions on the test split, in the original target scale.
# `model` was built for the flattened knob2vec input, but the GRU section
# rebinds K2vec_te to a 3-D tensor. Flatten per sample before the forward
# pass; this is a no-op when K2vec_te is already 2-D.
k2vec_te_flat = K2vec_te.reshape(K2vec_te.shape[0], -1)
pd.DataFrame(scaler_y.inverse_transform(model(k2vec_te_flat).cpu().detach().numpy()))

In [None]:
# Raw-knob baseline predictions on the test split, in the original target scale.
# NOTE(review): `_30` has the same form as IPython's output-cache variables
# (`_N` holds Out[N]) and can be overwritten by the kernel; consider a descriptive name.
knob_pred_norm = knob_model(knob_norm_te).cpu().detach().numpy()
_30 = scaler_y.inverse_transform(knob_pred_norm) # 30
_30

In [None]:
# Tabular view of the baseline predictions stored in `_30` by the previous cell.
pd.DataFrame(_30)

In [None]:
# Current raw-knob baseline predictions in the original target scale.
knob_pred_current = knob_model(knob_norm_te).cpu().detach().numpy()
pd.DataFrame(scaler_y.inverse_transform(knob_pred_current)) # 60