In [49]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
import xgboost
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from lightgbm import LGBMRegressor, plot_importance
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor

## local setting

In [3]:
# config data(.csv) -> DataFrame
config_df = pd.read_csv("./result_config10001_11000.csv", sep=',')

In [4]:
config_df

Unnamed: 0,appendonly,appendfsync,auto-aof-rewrite-percentage,auto-aof-rewrite-min-size,no-appendfsync-on-rewrite,aof-rewrite-incremental-fsync,aof-use-rdb-preamble,rdbcompression,rdbchecksum,rdb-save-incremental-fsync,...,maxmemory-policy,maxmemory-samples,lazyfree-lazy-eviction,lazyfree-lazy-expire,lazyfree-lazy-server-del,hash-max-ziplist-entries,hash-max-ziplist-value,activerehashing,hz,dynamic-hz
0,,,,,,,,no,no,no,...,volatile-lfu,7,no,yes,no,636,235,yes,7,no
1,,,,,,,,no,no,no,...,volatile-random,3,no,yes,yes,392,67,yes,21,yes
2,,,,,,,,no,no,yes,...,volatile-lfu,7,yes,yes,yes,699,94,yes,37,yes
3,,,,,,,,no,yes,no,...,volatile-lfu,7,yes,no,yes,400,136,no,9,no
4,,,,,,,,yes,yes,no,...,volatile-lru,7,yes,no,no,663,218,no,12,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,,,,,,,no,no,yes,...,allkeys-lfu,3,yes,yes,no,626,149,yes,13,no
996,,,,,,,,yes,yes,no,...,volatile-random,6,yes,yes,yes,745,129,yes,31,yes
997,,,,,,,,yes,no,no,...,volatile-lfu,6,yes,yes,yes,302,101,no,12,yes
998,,,,,,,,yes,yes,yes,...,volatile-lfu,4,no,yes,yes,291,67,yes,19,yes


In [17]:
# rdb knob으로 분류
config_df_rdb = config_df.loc[:,'rdbcompression':'dynamic-hz']

In [18]:
config_df_rdb.head()

Unnamed: 0,rdbcompression,rdbchecksum,rdb-save-incremental-fsync,activedefrag,active-defrag-threshold-lower,active-defrag-threshold-upper,active-defrag-cycle-min,active-defrag-cycle-max,maxmemory,maxmemory-policy,maxmemory-samples,lazyfree-lazy-eviction,lazyfree-lazy-expire,lazyfree-lazy-server-del,hash-max-ziplist-entries,hash-max-ziplist-value,activerehashing,hz,dynamic-hz
0,no,no,no,yes,7.0,83.0,12.0,83.0,,volatile-lfu,7,no,yes,no,636,235,yes,7,no
1,no,no,no,,,,,,,volatile-random,3,no,yes,yes,392,67,yes,21,yes
2,no,no,yes,yes,28.0,93.0,28.0,86.0,,volatile-lfu,7,yes,yes,yes,699,94,yes,37,yes
3,no,yes,no,,,,,,,volatile-lfu,7,yes,no,yes,400,136,no,9,no
4,yes,yes,no,,,,,,,volatile-lru,7,yes,no,no,663,218,no,12,no


In [6]:
# external data(.csv) -> DataFrame
external_df = pd.read_csv("./result_rdb_external_4.csv", sep=',')

In [5]:
# throuput 값으로 예상
external_df['Gets_KB/sec']

0      68121.32
1      67585.45
2      67425.93
3      67046.94
4      66755.71
         ...   
995    67567.17
996    68655.48
997    67605.50
998    67812.35
999    66795.58
Name: Gets_KB/sec, Length: 1000, dtype: float64

## 데이터 전처리

In [26]:
categorical_var = ['rdbcompression', 'rdbchecksum','rdb-save-incremental-fsync', 'activedefrag', 'maxmemory-policy','lazyfree-lazy-eviction','lazyfree-lazy-expire','lazyfree-lazy-server-del','activerehashing', 'dynamic-hz']

for cate_var in categorical_var:
    config_df_rdb[cate_var] = config_df_rdb[cate_var].astype('category') 


In [27]:
config_data = config_df_rdb
throughput_data = external_df['Gets_KB/sec']

In [45]:
# 넘파이로 변환!
config_data_np = config_data.to_numpy()
throughput_data_np = throughput_data.to_numpy()

In [46]:
config_data_np

array([['no', 'no', 'no', ..., 'yes', 7, 'no'],
       ['no', 'no', 'no', ..., 'yes', 21, 'yes'],
       ['no', 'no', 'yes', ..., 'yes', 37, 'yes'],
       ...,
       ['yes', 'no', 'no', ..., 'no', 12, 'yes'],
       ['yes', 'yes', 'yes', ..., 'yes', 19, 'yes'],
       ['no', 'yes', 'no', ..., 'yes', 34, 'yes']], dtype=object)

In [35]:
# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(config_data, throughput_data ,test_size=0.2)


In [47]:
# 넘파이로 변환 후 
# train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(config_data_np, throughput_data_np ,test_size=0.2)

In [36]:
# lightgbm 데이터셋으로 변환
train_ds = lgb.Dataset(X_train, label=y_train)
test_ds = lgb.Dataset(X_test, label=y_test)

In [37]:
params = {'learning_rate': 0.01, 
          'max_depth': 16, 
          'boosting': 'gbdt', 
          'objective': 'regression', 
          'metric': 'mse', 
          'is_training_metric': True, 
          'num_leaves': 144, 
          'feature_fraction': 0.9, 
          'bagging_fraction': 0.7, 
          'bagging_freq': 5, 
          'seed':2018}

In [38]:
model = lgb.train(params, train_ds, 1000, test_ds, verbose_eval=100, early_stopping_rounds=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 618
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 18
[LightGBM] [Info] Start training from score 67536.131338
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 441030
Early stopping, best iteration is:
[65]	valid_0's l2: 438542




In [50]:
# 넘파이로 변환 후
lgbmc = LGBMClassifier(n_estimators = 400)
evals = [(X_test, y_test)]

lgbmc.fit(X_train, y_train, early_stopping_rounds=100, eval_metric='logloss', eval_set=evals, verbose=True)
preds=lgbmc.predict(X_test)

ValueError: Unknown label type: 'continuous'

In [41]:
predict_train = model.predict(X_train)
predict_test = model.predict(X_test)

In [43]:
predict_test

array([67421.2500322 , 67574.96593877, 67542.54496478, 67595.03719466,
       67556.33341353, 67588.88689418, 67628.51816051, 67588.66642378,
       67650.2634678 , 67508.74044488, 67614.40947544, 67678.71888256,
       67558.04939737, 67447.84512734, 67355.77382286, 67521.38919389,
       67517.47679508, 67482.09759113, 67497.52384755, 67553.93576029,
       67375.00690651, 67429.3565497 , 67523.75360941, 67424.39016778,
       67550.77274851, 67385.26031335, 67512.93499739, 67558.29351989,
       67503.46996932, 67684.15786643, 67280.68136336, 67568.69237634,
       67312.99529453, 67632.57007484, 67636.14303369, 67554.12271889,
       67534.78251921, 67481.10281735, 67582.6097582 , 67364.1474873 ,
       67407.21242904, 67429.46986301, 67597.07568637, 67413.19273933,
       67480.83600268, 67646.65653419, 67526.63303013, 67545.38242882,
       67431.42021972, 67677.57592658, 67621.12551166, 67527.43527365,
       67593.36719676, 67506.94583503, 67351.68027074, 67430.07217874,
      

In [12]:
# 모델 생성
# 모델 생성
xgb_model = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)

In [13]:
# 모델 fitting
print(len(X_train), len(X_test))
xgb_model.fit(X_train,y_train)

900 100


ValueError: could not convert string to float: 'yes'

## LightGBM
XGboost 보다 성능이 띄어나고 속도가 빠르다.
XGboost의 경우 범주형 변수를 사용할 때 숫자형 변수로 바꿔줘야 한다. 이는 원핫인코딩을 통해서 이뤄진다. Xgboost는 트리 모델로 구현되는데 트리 모델은 범주의 갯수가 많은 변수를 원핫인코딩 하게 되면 트리가 언밸런스해지고, 깊이가 깊어진다고 한다. 깊이가 깊어짐에 따라 훈련하는데 시간이 오래걸리며 과적합의 위험이 존재하게 된다. 

하지만 LightGBM에서는 범주형 변수를 2개의 subset으로 잘 나누는 유용한 방법을 사용한다. 클래스를 분류하는데 가장 이득을 줄 수 있는 방법으로 이분화를 시킨다는 의미이다. 이렇게 되면 속도가 빨라지고 성능도 좋아진다고 한다. 범주형 변수의 type을 category로 지정만 해주면 모델이 알아서 처리해준다고 한다. 이렇게 되면 실제로 원한인코딩을 했을때 보다 단순히 카테고리 변수로 모델을 훈련시켰을때 성능이 더 좋아진다고 한다. 

In [14]:
# 
lgb = LGBMRegressor(n_estimators=400)

In [16]:
lgb.fit(X_train, y_train)

ValueError: could not convert string to float: 'yes'