# BO RDB v

* 모델 생성
* 목적함수 생성

In [28]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import xgboost
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from lightgbm import LGBMRegressor, plot_importance
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor

## 모델 생성

In [29]:
knobs_rdb = {
#     "appendonly": None,
#     "appendfsync": None,
#     "auto-aof-rewrite-percentage": None,
#     "auto-aof-rewrite-min-size": None,
#     "no-appendfsync-on-rewrite": None,
#     "aof-rewrite-incremental-fsync": None,
#     "aof-use-rdb-preamble": None,

    # "save": None,
    "rdbcompression": None,
    "rdbchecksum": None,
    "rdb-save-incremental-fsync": None,

    "activedefrag": None,
    "active-defrag-threshold-lower": None,
    "active-defrag-threshold-upper": None,
    "active-defrag-cycle-min": None,
    "active-defrag-cycle-max": None,

    # "maxmemory": None,
    "maxmemory-policy": None,
    "maxmemory-samples": None,
    "lazyfree-lazy-eviction": None,
    "lazyfree-lazy-expire": None,
    "lazyfree-lazy-server-del": None,

    "hash-max-ziplist-entries": None,
    "hash-max-ziplist-value": None,
    "activerehashing": None,
    "hz": None,
    "dynamic-hz": None
}

knob_list = list(knobs_rdb.keys())

In [30]:
knob_list

['rdbcompression',
 'rdbchecksum',
 'rdb-save-incremental-fsync',
 'activedefrag',
 'active-defrag-threshold-lower',
 'active-defrag-threshold-upper',
 'active-defrag-cycle-min',
 'active-defrag-cycle-max',
 'maxmemory-policy',
 'maxmemory-samples',
 'lazyfree-lazy-eviction',
 'lazyfree-lazy-expire',
 'lazyfree-lazy-server-del',
 'hash-max-ziplist-entries',
 'hash-max-ziplist-value',
 'activerehashing',
 'hz',
 'dynamic-hz']

## local setting

In [31]:
# config data(.csv) -> DataFrame
config_df = pd.read_csv("./result_config10001_11000.csv", sep=',')

In [32]:
config_df

Unnamed: 0,appendonly,appendfsync,auto-aof-rewrite-percentage,auto-aof-rewrite-min-size,no-appendfsync-on-rewrite,aof-rewrite-incremental-fsync,aof-use-rdb-preamble,rdbcompression,rdbchecksum,rdb-save-incremental-fsync,...,maxmemory-policy,maxmemory-samples,lazyfree-lazy-eviction,lazyfree-lazy-expire,lazyfree-lazy-server-del,hash-max-ziplist-entries,hash-max-ziplist-value,activerehashing,hz,dynamic-hz
0,,,,,,,,no,no,no,...,volatile-lfu,7,no,yes,no,636,235,yes,7,no
1,,,,,,,,no,no,no,...,volatile-random,3,no,yes,yes,392,67,yes,21,yes
2,,,,,,,,no,no,yes,...,volatile-lfu,7,yes,yes,yes,699,94,yes,37,yes
3,,,,,,,,no,yes,no,...,volatile-lfu,7,yes,no,yes,400,136,no,9,no
4,,,,,,,,yes,yes,no,...,volatile-lru,7,yes,no,no,663,218,no,12,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,no,no,yes,...,allkeys-lfu,3,yes,yes,no,626,149,yes,13,no
996,,,,,,,,yes,yes,no,...,volatile-random,6,yes,yes,yes,745,129,yes,31,yes
997,,,,,,,,yes,no,no,...,volatile-lfu,6,yes,yes,yes,302,101,no,12,yes
998,,,,,,,,yes,yes,yes,...,volatile-lfu,4,no,yes,yes,291,67,yes,19,yes


In [33]:
config_df['activedefrag'].unique()

array(['yes', nan], dtype=object)

In [34]:
# rdb knob으로 분류, loc로 인덱싱할 경우 df 출력, 그냥 인덱싱하면 serise로 출력
config_df_rdb = config_df.loc[:,'rdbcompression':'dynamic-hz']

In [35]:
config_df_rdb.head()

Unnamed: 0,rdbcompression,rdbchecksum,rdb-save-incremental-fsync,activedefrag,active-defrag-threshold-lower,active-defrag-threshold-upper,active-defrag-cycle-min,active-defrag-cycle-max,maxmemory,maxmemory-policy,maxmemory-samples,lazyfree-lazy-eviction,lazyfree-lazy-expire,lazyfree-lazy-server-del,hash-max-ziplist-entries,hash-max-ziplist-value,activerehashing,hz,dynamic-hz
0,no,no,no,yes,7.0,83.0,12.0,83.0,,volatile-lfu,7,no,yes,no,636,235,yes,7,no
1,no,no,no,,,,,,,volatile-random,3,no,yes,yes,392,67,yes,21,yes
2,no,no,yes,yes,28.0,93.0,28.0,86.0,,volatile-lfu,7,yes,yes,yes,699,94,yes,37,yes
3,no,yes,no,,,,,,,volatile-lfu,7,yes,no,yes,400,136,no,9,no
4,yes,yes,no,,,,,,,volatile-lru,7,yes,no,no,663,218,no,12,no


In [36]:
# external data(.csv) -> DataFrame
external_df = pd.read_csv("./result_rdb_external_4.csv", sep=',')

In [37]:
# throuput 값으로 예상
external_df['Gets_KB/sec']

0      68121.32
1      67585.45
2      67425.93
3      67046.94
4      66755.71
         ...   
995    67567.17
996    68655.48
997    67605.50
998    67812.35
999    66795.58
Name: Gets_KB/sec, Length: 1000, dtype: float64

In [38]:
# 카테고리 형 처리
categorical_var = ['rdbcompression', 'rdbchecksum','rdb-save-incremental-fsync', 'activedefrag', 'maxmemory-policy','lazyfree-lazy-eviction','lazyfree-lazy-expire','lazyfree-lazy-server-del','activerehashing', 'dynamic-hz']

for cate_var in categorical_var:
    config_df_rdb[cate_var] = config_df_rdb[cate_var].astype('category') 


In [39]:
# maxmemrory 생략
config_df_rdb = config_df_rdb.drop('maxmemory', axis=1)

In [40]:
config_data = config_df_rdb
throughput_data = external_df['Gets_KB/sec']

In [41]:
print(type(config_data))
print(type(throughput_data))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [42]:
# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(config_data, throughput_data ,test_size=0.2)


In [43]:
fit_params={"early_stopping_rounds":10, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            'verbose': 100,
            'feature_name': 'auto', # that's actually the default
            'categorical_feature': 'auto' # that's actually the default
           }

In [44]:
lgbr = lgb.LGBMRegressor(num_leaves= 15, max_depth=-1, 
                         random_state=314, 
                         silent=True, 
                         metric='None', 
                         n_jobs=4, 
                         n_estimators=1000,
                         colsample_bytree=0.9,
                         subsample=0.9,
                         learning_rate=0.1)


In [45]:
lgbr.fit(X_train, y_train, **fit_params)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid's auc: 1




LGBMRegressor(colsample_bytree=0.9, metric='None', n_estimators=1000, n_jobs=4,
              num_leaves=15, random_state=314, subsample=0.9)

In [46]:
y_predict = lgbr.predict(X_test)

---

In [47]:
# none 존재여부 확인
config_data.head()

# for i in knob_list:
#     print(config_data[i].unique)
config_data.isnull().sum()

rdbcompression                     0
rdbchecksum                        0
rdb-save-incremental-fsync         0
activedefrag                     490
active-defrag-threshold-lower    490
active-defrag-threshold-upper    490
active-defrag-cycle-min          490
active-defrag-cycle-max          490
maxmemory-policy                   0
maxmemory-samples                  0
lazyfree-lazy-eviction             0
lazyfree-lazy-expire               0
lazyfree-lazy-server-del           0
hash-max-ziplist-entries           0
hash-max-ziplist-value             0
activerehashing                    0
hz                                 0
dynamic-hz                         0
dtype: int64

# Objective Func-목적함수

In [48]:
X_test.head()

Unnamed: 0,rdbcompression,rdbchecksum,rdb-save-incremental-fsync,activedefrag,active-defrag-threshold-lower,active-defrag-threshold-upper,active-defrag-cycle-min,active-defrag-cycle-max,maxmemory-policy,maxmemory-samples,lazyfree-lazy-eviction,lazyfree-lazy-expire,lazyfree-lazy-server-del,hash-max-ziplist-entries,hash-max-ziplist-value,activerehashing,hz,dynamic-hz
144,yes,no,yes,yes,28.0,88.0,26.0,70.0,volatile-random,7,no,yes,no,469,188,yes,23,yes
214,no,yes,yes,yes,5.0,92.0,1.0,88.0,volatile-lru,5,yes,yes,yes,298,223,yes,35,no
409,no,yes,yes,,,,,,volatile-lfu,4,no,no,yes,735,53,yes,1,no
917,yes,yes,yes,,,,,,volatile-lfu,5,no,no,yes,579,231,yes,17,yes
610,yes,yes,no,yes,14.0,100.0,30.0,89.0,allkeys-lru,4,no,no,no,608,106,yes,33,no


In [49]:
# 테스트
knob_value = list(X_test.loc[58])

KeyError: 58

In [50]:
knob_value = ['no',
 'yes',
 'yes',
 'no',
 4.0,
 80.0,
 19.0,
 89.0,
 'volatile-lru',
 5,
 'no',
 'yes',
 'no',
 526,
 32,
 'yes',
 4,
 'yes']

In [51]:
noise_level = 0.1

def f(knob_value, noise_level=noise_level):
    knob_dict = {}

    for i, knob in enumerate(knob_list): # 딕셔너리로 이동
        knob_dict[knob] = knob_value[i]
    
    if knob_dict['activedefrag'] == 'no':  # activedefrag no인 경우 나머지 관련 knob들에 0할당!
        activ_var = ['active-defrag-threshold-lower', 'active-defrag-threshold-upper', 'active-defrag-cycle-min', 'active-defrag-cycle-max']
        for activ_knob in activ_var:
            knob_dict[activ_knob] = 0     
    
    knob_df = pd.DataFrame(knob_dict, index=[0])  # 데이터 프레임으로 변환
    
    categorical_var = ['rdbcompression', 'rdbchecksum','rdb-save-incremental-fsync', 'activedefrag', 'maxmemory-policy','lazyfree-lazy-eviction','lazyfree-lazy-expire','lazyfree-lazy-server-del','activerehashing', 'dynamic-hz']
    for cate_var in categorical_var:  # 카테고리 형으로 변환
        knob_df[cate_var] = knob_df[cate_var].astype('category')

    return -lgbr.predict(knob_df)[0]


#     change_numeric = ['active-defrag-threshold-lower', 'active-defrag-threshold-upper', 'active-defrag-cycle-min', 'active-defrag-cycle-max']


In [52]:
# knob 바운드
knob_bound = [
    # (rdbcompression), (rdbchecksum), (rdb-save-incremental-fsync)
    ('yes', 'no'), ('yes', 'no'), ('yes', 'no'),
    
    # (activedefrag), (active-defrag-threshold-lower), (active-defrag-threshold-upper), (active-defrag-cycle-min), (active-defrag-cycle-max)
    ('yes', 'no'), (1, 31), (70, 101), (1, 31), (70, 91),
    
    # maxmemory -> 수정 필요!
    ## (maxmemmory-policy),  
    ("volatile-lru", "allkeys-lru", "volatile-lfu", "allkeys-lfu", "volatile-random","allkeys-random", "volatile-ttl", "noeviction"),
    ## (maxmemory-samples)
    (3, 7),
    
    #(lazyfree-lazy-eviction), (lazyfree-lazy-expire), (lazyfree-lazy-server-del)
    ('yes', 'no'), ('yes', 'no'), ('yes', 'no'),
    
    
    # (hash-max-ziplist=entries), (hash-max-ziplist-value)
    (256, 751), (16, 257),
    
    # activerehashing, hz, dynamic-hz
    ('yes', 'no'), (1, 41), ('yes', 'no')
]

In [78]:
type(knob_bound[0][0])

str

## gp_minimize

In [53]:
%matplotlib inline
print(__doc__)

import numpy as np
np.random.seed(237)
import matplotlib.pyplot as plt
from skopt.plots import plot_gaussian_process
from skopt import gp_minimize

Automatically created module for IPython interactive environment


In [72]:
res = gp_minimize(f,                  # the function to minimize
                  knob_bound,      # the bounds on each dimension of x
                  acq_func="EI",      # the acquisition function
                  n_calls=15,         # the number of evaluations of f
                  n_random_starts=5,  # the number of random initialization points
                  noise=0.1**2,       # the noise level (optional)
                  random_state=1234)   # the random seed

In [73]:
res.x

['no',
 'yes',
 'yes',
 'no',
 25,
 92,
 13,
 77,
 'allkeys-lru',
 3,
 'no',
 'no',
 'yes',
 256,
 96,
 'no',
 26,
 'no']

In [74]:
# random_state = 1233, 67534.46096
# random_state = 1234, 67542.50152
res.fun 

-67542.50152768569