In [None]:
import pandas as pd
import numpy as np
import os, sys
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from tqdm.auto import tqdm
import json
sys.path.append('../')
from models.knobs import rocksdb_knobs_make_dict

In [None]:
# Loop bound used by the per-`wk` CSV loaders further down
# (presumably the number of workloads - TODO confirm).
wk_len = 16

# Input locations, relative to the notebook's working directory.
PATH_knobs = 'rocksdb_conf'
PATH_internal = 'internal'
PATH_external = 'external'

In [None]:
# Parse the knob configurations, then wrap the float32 value matrix in a
# DataFrame whose columns are the knob labels returned by the parser.
knob_dict = rocksdb_knobs_make_dict(PATH_knobs)
knobs = pd.DataFrame(
    knob_dict['data'].astype(np.float32),
    columns=knob_dict['columnlabels'],
)
columns = knobs.columns
knobs.head(4)

In [None]:
# Report how many distinct values each knob takes across all configurations.
# The figure printed is the number of distinct values (not a sum of counts),
# so the header is labelled accordingly.
print("KNOB NAME \t\t\t\t number of distinct values")
for col in columns:
    print(f"{col:40}\t{knobs[col].nunique()}")

In [None]:
# For every knob, map each distinct value (in ascending order) to its
# position 0..n-1; that position is the value's slot in the one-hot encoding.
index_value = {}
for col in columns:
    distinct = knobs[[col]].value_counts().sort_index().index
    index_value[col] = pd.Series(data=range(len(distinct)), index=distinct)

In [None]:
# Inspect the per-knob value -> one-hot position tables built above.
index_value

In [None]:
# One-hot encode every knob and concatenate the per-knob blocks column-wise.
# The matrix is built one knob at a time: growing a tensor row by row with
# torch.cat re-copies everything accumulated so far on each iteration, which
# is quadratic in the number of samples.
one_hot_blocks = []
for col in tqdm(columns):
    # np.unique returns the distinct values in ascending order plus, for each
    # row, the position of its value in that order - the same
    # "sorted distinct value -> position" mapping that index_value holds.
    distinct, codes = np.unique(knobs[col].to_numpy(), return_inverse=True)
    block = np.zeros((len(knobs), len(distinct)), dtype=np.float32)
    block[np.arange(len(knobs)), codes.reshape(-1)] = 1
    one_hot_blocks.append(block)

knobs_one_hot = torch.from_numpy(np.concatenate(one_hot_blocks, axis=1))
np.save('knobsOneHot.npy', knobs_one_hot.numpy())
# Last expression of the cell so that the shape is actually displayed.
knobs_one_hot.shape

In [None]:
# Sanity check: read the saved one-hot matrix back from disk.
np.load('knobsOneHot.npy')

In [None]:
PATH_internal = "internal"
PATH_external = "external"
wk_len = 16
internal_dict = dict()

# The pruned CSV is only used for its column names: they select which
# internal-metric columns are kept from the per-`wk` result files.
pruned_path = os.path.join(PATH_internal, 'internal_ensemble_pruned_tmp.csv')
pruned_im = pd.read_csv(pruned_path, index_col=0)
for wk in range(wk_len):
    result_path = os.path.join(PATH_internal, f'internal_results_{wk}.csv')
    im = pd.read_csv(result_path, index_col=0)
    internal_dict[wk] = im[pruned_im.columns]
    # Stop after the first file: only entry 0 is loaded.
    break
internal_dict[0].head(4)

In [None]:
# External results per `wk`; as with the internal metrics, the loop stops
# after the first file, so only entry 0 is populated.
external_dict = dict()
for wk in range(wk_len):
    result_path = os.path.join(PATH_external, f'external_results_{wk}.csv')
    ex = pd.read_csv(result_path, index_col=0)
    external_dict[wk] = ex
    break
external_dict[0].head(4)

## Train / Test Split
- 80/20 split: 16,000 training samples and 4,000 test samples

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
class RocksDBDataset(Dataset):
    """Dataset that pairs the i-th entry of ``X`` with the i-th entry of ``y``.

    Both containers are stored as given; they only need to support ``len()``
    (for ``X``) and integer indexing.
    """

    def __init__(self, X, y):
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is defined by the inputs alone.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
class SingleNet(nn.Module):
    """Two-layer MLP: ``Linear -> ReLU`` (``knob_fc``) followed by ``Linear`` (``im_fc``).

    Args:
        input_dim: width of the input features.
        hidden_dim: width of the hidden representation produced by ``knob_fc``.
        output_dim: width of the prediction produced by ``im_fc``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.input_dim, self.hidden_dim, self.output_dim = input_dim, hidden_dim, output_dim
        # knob_fc is created before im_fc so parameter initialisation order is unchanged.
        self.knob_fc = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())
        self.im_fc = nn.Sequential(nn.Linear(hidden_dim, output_dim))

    def forward(self, x):
        """Return the prediction for ``x``.

        The hidden activation and the output of the latest call are kept on
        the module as ``x_kb`` and ``x_im``.
        """
        hidden = self.x_kb = self.knob_fc(x)
        prediction = self.x_im = self.im_fc(hidden)
        return prediction

In [None]:
def train(model, train_loader, lr, optimizer=None):
    """Run one training epoch with an MSE objective and return the mean batch loss.

    Args:
        model: module to optimise; it is switched to training mode.
        train_loader: iterable of ``(data, target)`` batches supporting ``len()``.
        lr: learning rate for the AdamW optimizer that is created when
            ``optimizer`` is None.
        optimizer: optional, already constructed optimizer. Pass the same
            instance on every epoch to keep its running state (AdamW's moment
            estimates and step count) between epochs; when omitted a fresh
            AdamW is built on each call, which resets that state.

    Returns:
        float: mean of the per-batch MSE losses over the epoch.
    """
    if optimizer is None:
        optimizer = optim.AdamW(model.parameters(), lr=lr)

    model.train()

    total_loss = 0.
    for data, target in train_loader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        output = model(data)  # must have the same shape as target for mse_loss
        loss = F.mse_loss(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    total_loss /= len(train_loader)
    return total_loss

def valid(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader`` and return the mean per-batch MSE.

    The model is put in evaluation mode and gradients are disabled while the
    batches are scored.
    """
    model.eval()
    with torch.no_grad():
        batch_losses = [
            F.mse_loss(model(data), target).item()  # mean squared error
            for data, target in valid_loader
        ]
    return sum(batch_losses) / len(valid_loader)

In [None]:
# Gather the three aligned tables used for modelling: one-hot knobs (from
# disk) and the internal / external results stored under key 0.
knobOneHot = np.load('knobsOneHot.npy')
internal_m, external_m = internal_dict[0], external_dict[0]
tuple(table.shape for table in (knobOneHot, internal_m, external_m))

In [None]:
# Split all four aligned tables with the same shuffled 80/20 partition
# (fixed seed), so row i refers to the same configuration in each of them.
split = train_test_split(
    knobOneHot, internal_m, external_m, knobs,
    test_size=0.2, random_state=24,
)
X_tr, X_te, Im_tr, Im_te, y_tr, y_te, knob_tr, knob_te = split
tuple(part.shape for part in split)

### Pre-training

In [None]:
def _as_cuda_tensor(array):
    """Copy an array-like into a default-dtype tensor and move it to the GPU."""
    return torch.Tensor(array).cuda()


# Every scaler is fitted on the training split only.
scaler_X = MinMaxScaler().fit(X_tr) # range: 0~1
scaler_knob = MinMaxScaler().fit(knob_tr)
scaler_Im = MinMaxScaler().fit(Im_tr)
scaler_y = StandardScaler().fit(y_tr)

# The one-hot inputs are used as they are (scaler_X is fitted but not applied).
X_tr, X_te = _as_cuda_tensor(X_tr), _as_cuda_tensor(X_te)
Im_norm_tr = _as_cuda_tensor(scaler_Im.transform(Im_tr))
Im_norm_te = _as_cuda_tensor(scaler_Im.transform(Im_te))
y_norm_tr = _as_cuda_tensor(scaler_y.transform(y_tr))
y_norm_te = _as_cuda_tensor(scaler_y.transform(y_te))

# Pre-training task: one-hot knobs -> scaled internal metrics.
Dataset_tr = RocksDBDataset(X_tr, Im_norm_tr)
Dataset_te = RocksDBDataset(X_te, Im_norm_te)

loader_tr = DataLoader(Dataset_tr, batch_size=32, shuffle=True)
loader_te = DataLoader(Dataset_te, batch_size=32, shuffle=True)

In [None]:
# Knob -> internal-metric network used for pre-training. The output width is
# read from the target tensor rather than hard-coded, so it always matches
# the number of internal-metric columns the MSE loss is computed against.
k2i_model = SingleNet(input_dim=X_tr.shape[1], hidden_dim=1024, output_dim=Im_norm_tr.shape[1]).cuda()

In [None]:
# Pre-train k2i_model on the one-hot knobs -> internal metrics task and log
# the mean training loss of every epoch.
lr, epochs = 0.001, 30
losses_tr = list()

for epoch in range(epochs):
    loss_tr = train(k2i_model, loader_tr, lr)
    losses_tr += [loss_tr]
    print(f"[{epoch:02d}/{epochs}] loss_tr: {loss_tr}")

### Train with knob2vec

In [None]:
# Transposed weight matrix of the first linear layer: row j holds the
# hidden-space vector attached to one-hot input position j.
first_layer = k2i_model.knob_fc[0]
lookup_table = first_layer.weight.detach().cpu().numpy().T
lookup_table.shape

In [None]:
def get_knob2vec(data, table, n_knobs=None):
    """Replace every active one-hot position by its row of ``table``.

    Args:
        data: 2-D tensor (or array-like accepted by ``torch.as_tensor``) of
            one-hot encoded samples; a position is active where the value is 1.
        table: 2-D array whose row ``j`` is the vector for one-hot position ``j``.
        n_knobs: number of active positions per sample. When None it is taken
            from the first sample (0 if ``data`` has no rows).

    Returns:
        numpy.ndarray of shape ``(n_samples, n_knobs, table.shape[1])``, the
        vectors of each sample ordered by ascending one-hot position.

    Raises:
        ValueError: if a sample does not have exactly ``n_knobs`` active positions.
    """
    # Index the table that was passed in, not a same-named notebook global.
    table = np.asarray(table)
    hot = torch.as_tensor(data) == 1
    n_samples = hot.shape[0]
    if n_knobs is None:
        n_knobs = int(hot[0].sum()) if n_samples else 0

    k2vec = np.zeros((n_samples, n_knobs, table.shape[1]))
    for i in range(n_samples):
        # reshape(-1) keeps a 1-D index array even when only one position is active.
        idx = hot[i].nonzero().reshape(-1).cpu().numpy()
        if idx.shape[0] != n_knobs:
            raise ValueError(
                f"sample {i} has {idx.shape[0]} active positions, expected {n_knobs}"
            )
        k2vec[i] = table[idx]
    return k2vec

In [None]:
# Swap each one-hot sample for the concatenation of its knobs' lookup-table
# vectors: (n_samples, n_knobs, dim) is flattened to (n_samples, n_knobs * dim).
K2vec_tr = torch.Tensor(get_knob2vec(X_tr, lookup_table)).cuda()
K2vec_tr = K2vec_tr.reshape(K2vec_tr.shape[0], -1)
K2vec_te = torch.Tensor(get_knob2vec(X_te, lookup_table)).cuda()
K2vec_te = K2vec_te.reshape(K2vec_te.shape[0], -1)

# Downstream task: knob2vec features -> standardised external results.
Dataset_K2vec_tr = RocksDBDataset(K2vec_tr, y_norm_tr)
Dataset_K2vec_te = RocksDBDataset(K2vec_te, y_norm_te)

loader_K2vec_tr = DataLoader(Dataset_K2vec_tr, batch_size=32, shuffle=True)
loader_K2vec_te = DataLoader(Dataset_K2vec_te, batch_size=32, shuffle=True)

In [None]:
# knob2vec -> external-result network. The output width follows the target
# tensor instead of a hard-coded 4, so it cannot drift from the label columns.
model = SingleNet(input_dim=K2vec_tr.shape[-1], hidden_dim=64, output_dim=y_norm_tr.shape[1]).cuda()

In [None]:
# Train the knob2vec model, scoring the held-out split after every epoch.
lr, epochs = 0.001, 30
losses_tr, losses_te = [], []
for epoch in range(epochs):
    loss_tr = train(model, loader_K2vec_tr, lr)
    loss_te = valid(model, loader_K2vec_te)
    losses_tr += [loss_tr]
    losses_te += [loss_te]
    print(f"[{epoch:02d}/{epochs}] loss_tr: {loss_tr}\tloss_te:{loss_te:.4f}")

### Train with raw knob

In [None]:
# Baseline inputs: the raw knob values, min-max scaled with the scaler that
# was fitted on the training split.
knob_norm_tr, knob_norm_te = (
    torch.Tensor(scaler_knob.transform(part)).cuda() for part in (knob_tr, knob_te)
)

Dataset_knob_tr = RocksDBDataset(knob_norm_tr, y_norm_tr)
Dataset_knob_te = RocksDBDataset(knob_norm_te, y_norm_te)

loader_knob_tr = DataLoader(Dataset_knob_tr, batch_size=32, shuffle=True)
loader_knob_te = DataLoader(Dataset_knob_te, batch_size=32, shuffle=True)

In [None]:
# Raw-knob baseline network. The output width follows the target tensor
# instead of a hard-coded 4, so it cannot drift from the label columns.
knob_model = SingleNet(input_dim=knob_norm_tr.shape[1], hidden_dim=16, output_dim=y_norm_tr.shape[1]).cuda()

In [None]:
# Train the raw-knob baseline, scoring the held-out split after every epoch.
lr, epochs = 0.001, 30
losses_tr, losses_te = [], []
for epoch in range(epochs):
    loss_tr = train(knob_model, loader_knob_tr, lr)
    loss_te = valid(knob_model, loader_knob_te)
    losses_tr += [loss_tr]
    losses_te += [loss_te]
    print(f"[{epoch:02d}/{epochs}] loss_tr: {loss_tr}\tloss_te:{loss_te:.4f}")

In [None]:
# Test-split targets mapped back from standardised to original units.
y_te_standardised = y_norm_te.detach().cpu().numpy()
pd.DataFrame(scaler_y.inverse_transform(y_te_standardised))

In [None]:
# knob2vec model predictions on the test split, in original units.
k2vec_pred_standardised = model(K2vec_te).detach().cpu().numpy()
pd.DataFrame(scaler_y.inverse_transform(k2vec_pred_standardised))

In [None]:
# Raw-knob baseline predictions on the test split, mapped back to original units.
# NOTE(review): `_30` has the same form as IPython's `_N` output-cache names
# (the output of cell 30); a descriptive name would avoid the clash - it must
# be renamed together with the cell that displays it.
_30 = scaler_y.inverse_transform(knob_model(knob_norm_te).cpu().detach().numpy()) # 30 - presumably the epoch count of this run; TODO confirm
_30

In [None]:
# Same predictions as the `_30` array, rendered as a table.
pd.DataFrame(_30)

In [None]:
# Raw-knob baseline predictions on the test split, in original units
# (the original cell was tagged "60" - presumably the epoch count; TODO confirm).
knob_pred_standardised = knob_model(knob_norm_te).detach().cpu().numpy()
pd.DataFrame(scaler_y.inverse_transform(knob_pred_standardised))