In [1]:
# No leak data
import pandas as pd
import numpy as np
import gc 
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


In [2]:
# Bayesian Optimization Functions

# https://www.kaggle.com/aitude/ashrae-hyperparameter-tuning

In [3]:
train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')

In [4]:
# some feature enginnering

train_df['age'] = train_df['year_built'].max()-train_df['year_built']+1
test_df['age'] = test_df['year_built'].max() - test_df['year_built'] + 1

#Handling missing values
# To streamline this though process it is useful to know the 3 categories in which missing data can be classified into:

# Missing Completely at Random (MCAR)
# Missing at Random (MAR)
# Missing Not at Random (MNAR)

train_df['floor_count'] = train_df['floor_count'].fillna(-999).astype(np.int16)
test_df['floor_count'] = test_df['floor_count'].fillna(-999).astype(np.int16)

train_df['year_built'] = train_df['year_built'].fillna(-999).astype(np.int16)
test_df['year_built'] = test_df['year_built'].fillna(-999).astype(np.int16)

train_df['age'] = train_df['age'].fillna(-999).astype(np.int16)
test_df['age'] = test_df['age'].fillna(-999).astype(np.int16)

train_df['cloud_coverage'] = train_df['cloud_coverage'].fillna(-999).astype(np.int16)
test_df['cloud_coverage'] = test_df['cloud_coverage'].fillna(-999).astype(np.int16) 

train_df['square_feet'] = np.log1p(train_df['square_feet'])
test_df['square_feet'] = np.log1p(test_df['square_feet'])

In [5]:
def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):
    """Helper routine for 'find_bad_zeros'.
    
    This operates upon a single dataframe produced by 'groupby'. We expect an 
    additional column 'meter_id' which is a duplicate of 'meter' because groupby 
    eliminates the original one."""
    meter = Xy_subset.meter_id.iloc[0]
    is_zero = Xy_subset.meter_reading == 0
    if meter == 0:
        # Electrical meters should never be zero. Keep all zero-readings in this table so that
        # they will all be dropped in the train set.
        return is_zero

    transitions = (is_zero != is_zero.shift(1))
    all_sequence_ids = transitions.cumsum()
    ids = all_sequence_ids[is_zero].rename("ids")
    if meter in [2, 3]:
        # It's normal for steam and hotwater to be turned off during the summer
        keep = set(ids[(Xy_subset.timestamp < summer_start) |
                       (Xy_subset.timestamp > summer_end)].unique())
        is_bad = ids.isin(keep) & (ids.map(ids.value_counts()) >= min_interval)
    elif meter == 1:
        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index("timestamp").ids
        is_bad = ids.map(ids.value_counts()) >= min_interval

        # Cold water may be turned off during the winter
        jan_id = time_ids.get(0, False)
        dec_id = time_ids.get(8283, False)
        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and
                dec_id == time_ids.get(8783, False)):
            is_bad = is_bad & (~(ids.isin(set([jan_id, dec_id]))))
    else:
        raise Exception(f"Unexpected meter type: {meter}")

    result = is_zero.copy()
    result.update(is_bad)
    return result

def find_bad_zeros(X, y):
    """Returns an Index object containing only the rows which should be deleted."""
    Xy = X.assign(meter_reading=y, meter_id=X.meter)
    is_bad_zero = Xy.groupby(["building_id", "meter"]).apply(make_is_bad_zero)
    return is_bad_zero[is_bad_zero].index.droplevel([0, 1])

def find_bad_sitezero(X):
    """Returns indices of bad rows from the early days of Site 0 (UCF)."""
    return X[(X.timestamp < 3378) & (X.site_id == 0) & (X.meter == 0)].index

def find_bad_building1099(X, y):
    """Returns indices of bad rows (with absurdly high readings) from building 1099."""
    return X[(X.building_id == 1099) & (X.meter == 2) & (y > 3e4)].index

def find_bad_rows(X, y):
    return find_bad_zeros(X, y).union(find_bad_sitezero(X)).union(find_bad_building1099(X, y))


In [6]:
# bad rows 삭제 3% 
train_df['timestamp'] = (train_df.timestamp - pd.to_datetime("2016-01-01")).dt.total_seconds() // 3600
rows = find_bad_rows(train_df.drop(['meter_reading'],axis =1 ),train_df['meter_reading'])
rows = list(rows)
train_df= train_df.drop(rows)

In [7]:
# drop_cols = ['date',"precip_depth_1_hr", "sea_level_pressure", "wind_direction", "wind_speed","timestamp"]
drop_cols = ['date',"timestamp"]
target = train_df["meter_reading_log1p"]
del train_df["meter_reading"], train_df['meter_reading_log1p']
train_df = train_df.drop(drop_cols, axis=1)
drop_cols += ["row_id"]
# drop_cols.remove('date')
test_df = test_df.drop(drop_cols, axis=1)

In [8]:
train_df.head()
categorical_features = ["building_id", "site_id", "meter", "hour", "weekend",'month','is_holiday','primary_use']

In [9]:
leak_df = pd.read_pickle('leak_df.pkl')
leak_df = leak_df.dropna()


In [10]:
%%time
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from bayes_opt import BayesianOptimization
folds = 3
seed = 99 #666

# Hyperparameter range
params_range = {
                'num_leaves': (1000, 1500),
                'feature_fraction': (0.7, 0.9),
                'bagging_fraction': (0.8, 1),
                'max_depth': (10, 11),
                'lambda_l1': (0, 5),
                'lambda_l2': (0, 5),
                'min_split_gain': (0.001, 0.1),
                'min_child_weight': (5, 50),
                'learning_rate' : (.03,.07)
               }



Wall time: 1.28 s


In [None]:
%%time
paramlst = []
scorelst =[]
def running(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight, learning_rate):
    params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": int(round(num_leaves)),
    "learning_rate": learning_rate,
    "feature_fraction": max(min(feature_fraction,1),0),
    "bagging_fraction": max(min(bagging_fraction,1),0),
    "max_depth" : int(round(max_depth)),
    "lambda_l1" : max(lambda_l1,0),
    "lambda_l2" : max(lambda_l2,0),
    "min_split_gain": min_split_gain,
    "min_child_weight": min_child_weight,
    "metric": "rmse",
    "n_jobs" : 10
    }
    
    print(params)
    kf = KFold(n_splits=folds, shuffle=False, random_state=seed)
    scores = [] 
    models = []
    for train_index, val_index in kf.split(train_df):
        train_X = train_df.iloc[train_index]
    
        val_X = train_df.iloc[val_index]
        train_y = target.iloc[train_index]
        val_y = target.iloc[val_index]
        lgb_train = lgb.Dataset(train_X, train_y,categorical_feature=categorical_features)
        lgb_eval = lgb.Dataset(val_X, val_y,categorical_feature=categorical_features)
        gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000, #300,
                    valid_sets=(lgb_train, lgb_eval),
#                     feval=rmsle,
                    early_stopping_rounds= 50,#100,
                    verbose_eval=30) #100)
        models.append(gbm)
        del gbm
    
    i = 0
    res3 = []
    step_size = 50000
        
    for j in tqdm(range(int(np.ceil(test_df.shape[0] / 50000)))):
        res3.append([np.expm1(model.predict(test_df.iloc[i:i + step_size])) for model in models])
        i += step_size
        
    res4 = []
    print(len(res3))
    for k in range(len(res3)):
        res4.append((0.6*res3[k][0]+0.3*res3[k][1]+0.1*res3[k][2]))
#         if i==100 : break
    
    
    res4 = np.concatenate(res4)
    sample_submission = pd.read_csv('sample_submission.csv')
    sample_submission["meter_reading"] = res4
    sample_submission.loc[sample_submission['meter_reading'] < 0, 'meter_reading'] = 0

    sample_submission = pd.merge(sample_submission,leak_df.loc[:,['row_id','leak_meter_reading']],how ='inner')    
    score = np.sqrt(mean_squared_log_error( sample_submission['leak_meter_reading'], sample_submission['meter_reading'] ))
    print(score)
    scorelst.append(score)
    paramlst.append(params)
    
    df_para = pd.DataFrame({"parameter" : paramlst,"score": scorelst})
    df_para.to_csv('para.csv',index=False)
    
    return -np.sqrt(mean_squared_log_error( sample_submission['leak_meter_reading'], sample_submission['meter_reading'] ))        

    
Baysian = BayesianOptimization(running, params_range,random_state = seed)
Baysian.maximize(init_points = 5, n_iter = 20)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1495, 'learning_rate': 0.062321998534593914, 'feature_fraction': 0.7976156798481168, 'bagging_fraction': 0.9344557117261584, 'max_depth': 11, 'lambda_l1': 4.127475870179482, 'lambda_l2': 0.15723193813149072, 'min_split_gain': 0.005622876330188372, 'min_child_weight': 18.393012441677477, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.02813	valid_1's rmse: 1.17218
[60]	training's rmse: 0.738094	valid_1's rmse: 0.97515
[90]	training's rmse: 0.63781	valid_1's rmse: 0.917823
[120]	training's rmse: 0.610505	valid_1's rmse: 0.904778
[150]	training's rmse: 0.591082	valid_1's rmse: 0.897663
[180]	training's rmse: 0.575879	valid_1's rmse: 0.895148
[210]	training's rmse: 0.562561	valid_1's rmse: 0.892409
[240]	training's rmse: 0.550937	valid_1's rmse: 0.890725
[270]	training's rmse: 0.538694	valid_1's rmse: 0.889334
[300]	training's rmse: 0.528497	valid_1's rmse: 0.889293
[330]	training's rmse: 0.520405	valid_1's rmse: 0.888592
[360]	training's rmse: 0.513929	valid_1's rmse: 0.888774
[390]	training's rmse: 0.507213	valid_1's rmse: 0.889234
Early stopping, best iteration is:
[343]	training's rmse: 0.517548	valid_1's rmse: 0.888169
Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 0.918191	valid_1's rmse: 1.137

100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [38:30<00:00,  2.90s/it]


834
0.940564196078774
|  1        | -0.9406   |  0.9345   |  0.7976   |  4.127    |  0.1572   |  0.06232  |  10.57    |  18.39    |  0.005623 |  1.495e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1262, 'learning_rate': 0.049765898089639035, 'feature_fraction': 0.853958605637988, 'bagging_fraction': 0.801365146608731, 'max_depth': 11, 'lambda_l1': 3.7338355049714984, 'lambda_l2': 1.887194681674826, 'min_split_gain': 0.09742167338606704, 'min_child_weight': 22.795431995561913, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.10694	valid_1's rmse: 1.23534
[60]	training's rmse: 0.811533	valid_1's rmse: 1.02993
[90]	training's rmse: 0.666695	valid_1's rmse: 0.940186
[120]	training's rmse: 0.626887	valid_1's rmse: 0.913254
[150]	training's rmse: 0.602105	valid_1's rmse: 0.90202
[180]	training's rmse: 0.58686	valid_1's rmse: 0.895299
[210]	training's rmse: 0.573313	valid_1's rmse: 0.892221
[240]	training's rmse: 0.562689	valid_1's rmse: 0.88959
[270]	training's rmse: 0.553016	valid_1's rmse: 0.887797
[300]	training's rmse: 0.543656	valid_1's rmse: 0.88725
[330]	training's rmse: 0.535558	valid_1's rmse: 0.886433
[360]	training's rmse: 0.529057	valid_1's rmse: 0.885987
[390]	training's rmse: 0.522595	valid_1's rmse: 0.886422
[420]	training's rmse: 0.515928	valid_1's rmse: 0.886669
Early stopping, best iteration is:
[377]	training's rmse: 0.524908	valid_1's rmse: 0.885336
Training until validation scores don't improve for 50 round

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:03:29<00:00,  4.68s/it]


834
0.938524534882059
|  2        | -0.9385   |  0.8014   |  0.854    |  3.734    |  1.887    |  0.04977  |  10.93    |  22.8     |  0.09742  |  1.262e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1322, 'learning_rate': 0.04169076463836646, 'feature_fraction': 0.8626616825564258, 'bagging_fraction': 0.8187226186595505, 'max_depth': 11, 'lambda_l1': 1.0584339279468176, 'lambda_l2': 2.771728922953427, 'min_split_gain': 0.022936159801228193, 'min_child_weight': 42.26191547930651, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.17997	valid_1's rmse: 1.28687
[60]	training's rmse: 0.89107	valid_1's rmse: 1.08319
[90]	training's rmse: 0.716804	valid_1's rmse: 0.965566
[120]	training's rmse: 0.650414	valid_1's rmse: 0.922927
[150]	training's rmse: 0.622348	valid_1's rmse: 0.908752
[180]	training's rmse: 0.60361	valid_1's rmse: 0.898499
[210]	training's rmse: 0.589212	valid_1's rmse: 0.894139
[240]	training's rmse: 0.57705	valid_1's rmse: 0.891694
[270]	training's rmse: 0.567299	valid_1's rmse: 0.888712
[300]	training's rmse: 0.559252	valid_1's rmse: 0.886699
[330]	training's rmse: 0.551652	valid_1's rmse: 0.885421
[360]	training's rmse: 0.544138	valid_1's rmse: 0.884468
[390]	training's rmse: 0.538288	valid_1's rmse: 0.884132
[420]	training's rmse: 0.532252	valid_1's rmse: 0.883883
[450]	training's rmse: 0.52688	valid_1's rmse: 0.883815
[480]	training's rmse: 0.52248	valid_1's rmse: 0.883568
[510]	training's rmse: 0.517588	valid_

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:17:00<00:00,  5.66s/it]


834
0.939242984266318
|  3        | -0.9392   |  0.8187   |  0.8627   |  1.058    |  2.772    |  0.04169  |  10.82    |  42.26    |  0.02294  |  1.322e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1003, 'learning_rate': 0.038487847012622284, 'feature_fraction': 0.7823326477685877, 'bagging_fraction': 0.8190363243867061, 'max_depth': 10, 'lambda_l1': 0.48432630631333573, 'lambda_l2': 0.7200549945384233, 'min_split_gain': 0.024269334370928933, 'min_child_weight': 8.492622647000669, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.25805	valid_1's rmse: 1.35311
[60]	training's rmse: 0.984765	valid_1's rmse: 1.13961
[90]	training's rmse: 0.823332	valid_1's rmse: 1.0245
[120]	training's rmse: 0.736041	valid_1's rmse: 0.959175
[150]	training's rmse: 0.693064	valid_1's rmse: 0.934086
[180]	training's rmse: 0.671736	valid_1's rmse: 0.921217
[210]	training's rmse: 0.655046	valid_1's rmse: 0.91245
[240]	training's rmse: 0.64265	valid_1's rmse: 0.906788
[270]	training's rmse: 0.631715	valid_1's rmse: 0.901478
[300]	training's rmse: 0.620974	valid_1's rmse: 0.897152
[330]	training's rmse: 0.611321	valid_1's rmse: 0.893717
[360]	training's rmse: 0.602864	valid_1's rmse: 0.890163
[390]	training's rmse: 0.59519	valid_1's rmse: 0.887369
[420]	training's rmse: 0.588849	valid_1's rmse: 0.885915
[450]	training's rmse: 0.583134	valid_1's rmse: 0.884765
[480]	training's rmse: 0.577184	valid_1's rmse: 0.88346
[510]	training's rmse: 0.57171	valid_1'

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [2:10:10<00:00,  9.38s/it]


834
0.9396076355829647
|  4        | -0.9396   |  0.819    |  0.7823   |  0.4843   |  0.7201   |  0.03849  |  10.48    |  8.493    |  0.02427  |  1.003e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1400, 'learning_rate': 0.05168350795271688, 'feature_fraction': 0.8104468853755327, 'bagging_fraction': 0.9797288374144799, 'max_depth': 10, 'lambda_l1': 0.8377331385715175, 'lambda_l2': 4.644390917478679, 'min_split_gain': 0.06437411151091817, 'min_child_weight': 28.619457798290583, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.13188	valid_1's rmse: 1.244
[60]	training's rmse: 0.855466	valid_1's rmse: 1.03994
[90]	training's rmse: 0.715415	valid_1's rmse: 0.945762
[120]	training's rmse: 0.677357	valid_1's rmse: 0.920848
[150]	training's rmse: 0.652237	valid_1's rmse: 0.906888
[180]	training's rmse: 0.635299	valid_1's rmse: 0.899895
[210]	training's rmse: 0.619728	valid_1's rmse: 0.894135
[240]	training's rmse: 0.607772	valid_1's rmse: 0.890114
[270]	training's rmse: 0.595381	valid_1's rmse: 0.886913
[300]	training's rmse: 0.584408	valid_1's rmse: 0.885412
[330]	training's rmse: 0.574763	valid_1's rmse: 0.885331
[360]	training's rmse: 0.567608	valid_1's rmse: 0.88515
[390]	training's rmse: 0.55941	valid_1's rmse: 0.884274
[420]	training's rmse: 0.554455	valid_1's rmse: 0.8836
[450]	training's rmse: 0.550071	valid_1's rmse: 0.883845
Early stopping, best iteration is:
[416]	training's rmse: 0.555009	valid_1's rmse: 0.883405
Trai

100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [33:12<00:00,  2.54s/it]


834
0.9457913490442802
|  5        | -0.9458   |  0.9797   |  0.8104   |  0.8377   |  4.644    |  0.05168  |  10.04    |  28.62    |  0.06437  |  1.4e+03  |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1001, 'learning_rate': 0.03767574939024858, 'feature_fraction': 0.8758609171707527, 'bagging_fraction': 0.88043499503214, 'max_depth': 11, 'lambda_l1': 4.442130849147159, 'lambda_l2': 0.0880292622373563, 'min_split_gain': 0.0049632982075047145, 'min_child_weight': 49.92601050268472, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.21109	valid_1's rmse: 1.31278
[60]	training's rmse: 0.949953	valid_1's rmse: 1.12377
[90]	training's rmse: 0.760762	valid_1's rmse: 0.99023
[120]	training's rmse: 0.675412	valid_1's rmse: 0.934331
[150]	training's rmse: 0.637505	valid_1's rmse: 0.913724
[180]	training's rmse: 0.617071	valid_1's rmse: 0.903823
[210]	training's rmse: 0.600598	valid_1's rmse: 0.897436
[240]	training's rmse: 0.58858	valid_1's rmse: 0.893321
[270]	training's rmse: 0.578531	valid_1's rmse: 0.890817
[300]	training's rmse: 0.569136	valid_1's rmse: 0.889215
[330]	training's rmse: 0.561168	valid_1's rmse: 0.88685
[360]	training's rmse: 0.554352	valid_1's rmse: 0.885705
[390]	training's rmse: 0.548329	valid_1's rmse: 0.88489
[420]	training's rmse: 0.542127	valid_1's rmse: 0.884548
[450]	training's rmse: 0.537012	valid_1's rmse: 0.884416
[480]	training's rmse: 0.531627	valid_1's rmse: 0.884219
[510]	training's rmse: 0.526472	valid

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:19:05<00:00,  5.52s/it]


834
0.9398333786412235
|  6        | -0.9398   |  0.8804   |  0.8759   |  4.442    |  0.08803  |  0.03768  |  10.87    |  49.93    |  0.004963 |  1.001e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1135, 'learning_rate': 0.0416759858035137, 'feature_fraction': 0.8662654619340042, 'bagging_fraction': 0.9381946747088765, 'max_depth': 11, 'lambda_l1': 0.00689686139157597, 'lambda_l2': 0.19680698824551646, 'min_split_gain': 0.04733124191571435, 'min_child_weight': 47.17528463641185, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.17492	valid_1's rmse: 1.28505
[60]	training's rmse: 0.891954	valid_1's rmse: 1.08637
[90]	training's rmse: 0.715335	valid_1's rmse: 0.967862
[120]	training's rmse: 0.651592	valid_1's rmse: 0.927744
[150]	training's rmse: 0.620742	valid_1's rmse: 0.911904
[180]	training's rmse: 0.601785	valid_1's rmse: 0.902259
[210]	training's rmse: 0.586696	valid_1's rmse: 0.8979
[240]	training's rmse: 0.575419	valid_1's rmse: 0.894889
[270]	training's rmse: 0.566204	valid_1's rmse: 0.892097
[300]	training's rmse: 0.556742	valid_1's rmse: 0.890035
[330]	training's rmse: 0.548444	valid_1's rmse: 0.889067
[360]	training's rmse: 0.542021	valid_1's rmse: 0.888251
[390]	training's rmse: 0.535935	valid_1's rmse: 0.887987
[420]	training's rmse: 0.530455	valid_1's rmse: 0.887435
[450]	training's rmse: 0.525056	valid_1's rmse: 0.887426
[480]	training's rmse: 0.520159	valid_1's rmse: 0.887392
[510]	training's rmse: 0.51494	vali

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:14:54<00:00,  5.27s/it]


834
0.9375369163379721
|  7        | -0.9375   |  0.9382   |  0.8663   |  0.006897 |  0.1968   |  0.04168  |  10.7     |  47.18    |  0.04733  |  1.135e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1490, 'learning_rate': 0.030658898821068762, 'feature_fraction': 0.8908207659658284, 'bagging_fraction': 0.8815699216761337, 'max_depth': 11, 'lambda_l1': 4.977052154297511, 'lambda_l2': 0.28159624321490107, 'min_split_gain': 0.06324712381278111, 'min_child_weight': 49.6029475417167, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.29459	valid_1's rmse: 1.37756
[60]	training's rmse: 1.02691	valid_1's rmse: 1.18608
[90]	training's rmse: 0.839922	valid_1's rmse: 1.05035
[120]	training's rmse: 0.73178	valid_1's rmse: 0.976811
[150]	training's rmse: 0.665983	valid_1's rmse: 0.933722
[180]	training's rmse: 0.638508	valid_1's rmse: 0.916914
[210]	training's rmse: 0.62031	valid_1's rmse: 0.908837
[240]	training's rmse: 0.605722	valid_1's rmse: 0.901331
[270]	training's rmse: 0.594306	valid_1's rmse: 0.897161
[300]	training's rmse: 0.58439	valid_1's rmse: 0.894083
[330]	training's rmse: 0.575438	valid_1's rmse: 0.890976
[360]	training's rmse: 0.567282	valid_1's rmse: 0.889001
[390]	training's rmse: 0.560518	valid_1's rmse: 0.887885
[420]	training's rmse: 0.554505	valid_1's rmse: 0.886666
[450]	training's rmse: 0.549459	valid_1's rmse: 0.885864
[480]	training's rmse: 0.544955	valid_1's rmse: 0.885583
[510]	training's rmse: 0.540524	valid_

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [2:34:51<00:00, 10.74s/it]


834
0.9397269730211908
|  8        | -0.9397   |  0.8816   |  0.8908   |  4.977    |  0.2816   |  0.03066  |  10.95    |  49.6     |  0.06325  |  1.49e+03 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1128, 'learning_rate': 0.05656061731618453, 'feature_fraction': 0.8468926923368423, 'bagging_fraction': 0.8134338039975106, 'max_depth': 11, 'lambda_l1': 4.935340980501071, 'lambda_l2': 0.030388220246421738, 'min_split_gain': 0.027969248938944508, 'min_child_weight': 8.442946568625445, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.06661	valid_1's rmse: 1.20404
[60]	training's rmse: 0.765491	valid_1's rmse: 0.995531
[90]	training's rmse: 0.649702	valid_1's rmse: 0.924187
[120]	training's rmse: 0.616476	valid_1's rmse: 0.90795
[150]	training's rmse: 0.595488	valid_1's rmse: 0.900103
[180]	training's rmse: 0.580104	valid_1's rmse: 0.89565
[210]	training's rmse: 0.56718	valid_1's rmse: 0.891942
[240]	training's rmse: 0.556841	valid_1's rmse: 0.88984
[270]	training's rmse: 0.547117	valid_1's rmse: 0.888752
[300]	training's rmse: 0.538694	valid_1's rmse: 0.888174
[330]	training's rmse: 0.530071	valid_1's rmse: 0.887487
[360]	training's rmse: 0.522618	valid_1's rmse: 0.887643
[390]	training's rmse: 0.515387	valid_1's rmse: 0.888772
Early stopping, best iteration is:
[342]	training's rmse: 0.526787	valid_1's rmse: 0.88734
Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 0.935017	valid_1's rmse: 1.14869


100%|████████████████████████████████████████████████████████████████████████████████| 834/834 [45:16<00:00,  3.26s/it]


834
0.9409301741921224
|  9        | -0.9409   |  0.8134   |  0.8469   |  4.935    |  0.03039  |  0.05656  |  10.52    |  8.443    |  0.02797  |  1.128e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1000, 'learning_rate': 0.04258386387576778, 'feature_fraction': 0.7312725493563645, 'bagging_fraction': 0.9034990665974153, 'max_depth': 11, 'lambda_l1': 4.6757111554145805, 'lambda_l2': 4.810640356992706, 'min_split_gain': 0.030921679303981802, 'min_child_weight': 49.641674653060726, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.19109	valid_1's rmse: 1.30201
[60]	training's rmse: 0.917928	valid_1's rmse: 1.09836
[90]	training's rmse: 0.763026	valid_1's rmse: 0.987908
[120]	training's rmse: 0.692725	valid_1's rmse: 0.940929
[150]	training's rmse: 0.655592	valid_1's rmse: 0.918016
[180]	training's rmse: 0.635941	valid_1's rmse: 0.907068
[210]	training's rmse: 0.617087	valid_1's rmse: 0.899547
[240]	training's rmse: 0.60541	valid_1's rmse: 0.895319
[270]	training's rmse: 0.593996	valid_1's rmse: 0.891553
[300]	training's rmse: 0.584919	valid_1's rmse: 0.889331
[330]	training's rmse: 0.575719	valid_1's rmse: 0.887144
[360]	training's rmse: 0.568296	valid_1's rmse: 0.886273
[390]	training's rmse: 0.560642	valid_1's rmse: 0.884456
[420]	training's rmse: 0.555815	valid_1's rmse: 0.883716
[450]	training's rmse: 0.550607	valid_1's rmse: 0.883822
[480]	training's rmse: 0.544643	valid_1's rmse: 0.883033
[510]	training's rmse: 0.538913	va

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [2:08:35<00:00,  8.93s/it]


834
0.9393030286366488
|  10       | -0.9393   |  0.9035   |  0.7313   |  4.676    |  4.811    |  0.04258  |  10.83    |  49.64    |  0.03092  |  1e+03    |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1001, 'learning_rate': 0.04372621511985842, 'feature_fraction': 0.7164246779277352, 'bagging_fraction': 0.8932841595078215, 'max_depth': 10, 'lambda_l1': 1.7123541492555472, 'lambda_l2': 0.06846985953308071, 'min_split_gain': 0.032987110806922314, 'min_child_weight': 49.898248303711505, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.22586	valid_1's rmse: 1.3199
[60]	training's rmse: 0.955623	valid_1's rmse: 1.11701
[90]	training's rmse: 0.792223	valid_1's rmse: 1.00286
[120]	training's rmse: 0.730393	valid_1's rmse: 0.959361
[150]	training's rmse: 0.692696	valid_1's rmse: 0.936097
[180]	training's rmse: 0.674393	valid_1's rmse: 0.925524
[210]	training's rmse: 0.656972	valid_1's rmse: 0.918949
[240]	training's rmse: 0.643934	valid_1's rmse: 0.913307
[270]	training's rmse: 0.633367	valid_1's rmse: 0.909798
[300]	training's rmse: 0.623268	valid_1's rmse: 0.906124
[330]	training's rmse: 0.614508	valid_1's rmse: 0.902923
[360]	training's rmse: 0.604958	valid_1's rmse: 0.899573
[390]	training's rmse: 0.596826	valid_1's rmse: 0.898078
[420]	training's rmse: 0.590098	valid_1's rmse: 0.896336
[450]	training's rmse: 0.584492	valid_1's rmse: 0.895395
[480]	training's rmse: 0.577917	valid_1's rmse: 0.894385
[510]	training's rmse: 0.571284	val

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:24:09<00:00,  6.03s/it]


834
0.9401010546569719
|  11       | -0.9401   |  0.8933   |  0.7164   |  1.712    |  0.06847  |  0.04373  |  10.15    |  49.9     |  0.03299  |  1.001e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1000, 'learning_rate': 0.05315624265454305, 'feature_fraction': 0.8645494468247317, 'bagging_fraction': 0.8142576624681892, 'max_depth': 10, 'lambda_l1': 2.710162742911689, 'lambda_l2': 3.3817819752120175, 'min_split_gain': 0.0030006954085962942, 'min_child_weight': 49.99199397153941, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.125	valid_1's rmse: 1.24491
[60]	training's rmse: 0.827407	valid_1's rmse: 1.03008
[90]	training's rmse: 0.69994	valid_1's rmse: 0.937674
[120]	training's rmse: 0.663527	valid_1's rmse: 0.918399
[150]	training's rmse: 0.639254	valid_1's rmse: 0.906009
[180]	training's rmse: 0.621347	valid_1's rmse: 0.899205
[210]	training's rmse: 0.608793	valid_1's rmse: 0.895658
[240]	training's rmse: 0.596841	valid_1's rmse: 0.892434
[270]	training's rmse: 0.587163	valid_1's rmse: 0.890297
[300]	training's rmse: 0.579102	valid_1's rmse: 0.889488
[330]	training's rmse: 0.570421	valid_1's rmse: 0.888261
[360]	training's rmse: 0.563601	valid_1's rmse: 0.887981
[390]	training's rmse: 0.55626	valid_1's rmse: 0.887462
[420]	training's rmse: 0.549433	valid_1's rmse: 0.886517
[450]	training's rmse: 0.545175	valid_1's rmse: 0.886056
[480]	training's rmse: 0.539746	valid_1's rmse: 0.885932
[510]	training's rmse: 0.535134	valid

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [1:15:40<00:00,  5.33s/it]


834
0.9395617217426856
|  12       | -0.9396   |  0.8143   |  0.8645   |  2.71     |  3.382    |  0.05316  |  10.48    |  49.99    |  0.003001 |  1e+03    |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1001, 'learning_rate': 0.03042792458116519, 'feature_fraction': 0.7060691568444786, 'bagging_fraction': 0.9796043288930394, 'max_depth': 11, 'lambda_l1': 3.1014191538011877, 'lambda_l2': 0.8070015241472511, 'min_split_gain': 0.05456038021121535, 'min_child_weight': 49.66456069292796, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.32371	valid_1's rmse: 1.40448
[60]	training's rmse: 1.05117	valid_1's rmse: 1.19247
[90]	training's rmse: 0.879284	valid_1's rmse: 1.06768
[120]	training's rmse: 0.79897	valid_1's rmse: 1.01198
[150]	training's rmse: 0.726561	valid_1's rmse: 0.964357
[180]	training's rmse: 0.68679	valid_1's rmse: 0.938854
[210]	training's rmse: 0.656732	valid_1's rmse: 0.92142
[240]	training's rmse: 0.640978	valid_1's rmse: 0.911715
[270]	training's rmse: 0.629627	valid_1's rmse: 0.906744
[300]	training's rmse: 0.619124	valid_1's rmse: 0.901989
[330]	training's rmse: 0.607994	valid_1's rmse: 0.897115
[360]	training's rmse: 0.599543	valid_1's rmse: 0.8938
[390]	training's rmse: 0.592003	valid_1's rmse: 0.891411
[420]	training's rmse: 0.586443	valid_1's rmse: 0.889763
[450]	training's rmse: 0.581892	valid_1's rmse: 0.888058
[480]	training's rmse: 0.576726	valid_1's rmse: 0.886504
[510]	training's rmse: 0.571719	valid_1's

100%|██████████████████████████████████████████████████████████████████████████████| 834/834 [2:53:21<00:00, 12.28s/it]


834
0.9398112731176935
|  13       | -0.9398   |  0.9796   |  0.7061   |  3.101    |  0.807    |  0.03043  |  10.64    |  49.66    |  0.05456  |  1.001e+0 |
{'objective': 'regression', 'boosting': 'gbdt', 'num_leaves': 1000, 'learning_rate': 0.055861921192162954, 'feature_fraction': 0.7289323188494812, 'bagging_fraction': 0.8332483904053708, 'max_depth': 11, 'lambda_l1': 4.803761551265161, 'lambda_l2': 1.1019391559327336, 'min_split_gain': 0.0040384139597368085, 'min_child_weight': 49.624027452225434, 'metric': 'rmse', 'n_jobs': 10}




Training until validation scores don't improve for 50 rounds
[30]	training's rmse: 1.08466	valid_1's rmse: 1.2262
[60]	training's rmse: 0.809111	valid_1's rmse: 1.02025
[90]	training's rmse: 0.683283	valid_1's rmse: 0.938566
[120]	training's rmse: 0.645469	valid_1's rmse: 0.919553
[150]	training's rmse: 0.620703	valid_1's rmse: 0.908019
[180]	training's rmse: 0.606488	valid_1's rmse: 0.901433
[210]	training's rmse: 0.59244	valid_1's rmse: 0.897098
[240]	training's rmse: 0.580976	valid_1's rmse: 0.894933
[270]	training's rmse: 0.571205	valid_1's rmse: 0.892773
[300]	training's rmse: 0.561024	valid_1's rmse: 0.89143
[330]	training's rmse: 0.551261	valid_1's rmse: 0.890577
[360]	training's rmse: 0.542877	valid_1's rmse: 0.890116
[390]	training's rmse: 0.535084	valid_1's rmse: 0.88984
[420]	training's rmse: 0.530216	valid_1's rmse: 0.889657
[450]	training's rmse: 0.525496	valid_1's rmse: 0.889884
Early stopping, best iteration is:
[423]	training's rmse: 0.529695	valid_1's rmse: 0.889527
Tr

In [12]:
# def lgb_best_params(X, y,opt_params,init_points=2, optimization_round=20, n_folds=3, random_seed=0, cv_estimators=1000):
    
#     # prepare dataset
#     categorical_features = ["building_id", "site_id", "meter", "primary_use", "is_holiday", "weekend"]
#     train_data = lgb.Dataset(data=X, label=y, categorical_feature = categorical_features, free_raw_data=False)
    
#     def lgb_run(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight,learning_rate):
#         params = {"boosting": "gbdt",'application':'regression','num_iterations':cv_estimators, 'early_stopping_round':int(cv_estimators/5), 'metric':'rmse'}
#         params["num_leaves"] = int(round(num_leaves))
#         params['feature_fraction'] = max(min(feature_fraction, 1), 0)
#         params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
#         params['max_depth'] = int(round(max_depth))
#         params['lambda_l1'] = max(lambda_l1, 0)
#         params['lambda_l2'] = max(lambda_l2, 0)
#         params['min_split_gain'] = min_split_gain
#         params['min_child_weight'] = min_child_weight
#         params['learning_rate'] = learning_rate
#         cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval =cv_estimators, metrics=['rmse'])
#         return -min(cv_result['rmse-mean'])
    
#     params_finder = BayesianOptimization(lgb_run, opt_params, random_state=2021)
#     # optimize
#     params_finder.maximize(init_points=init_points, n_iter=optimization_round)

#     # return best parameters
#     return params_finder.max

In [13]:
# #  number of fold
# fold = 3

# # Hyperparameter range
# params_range = {
#                 'num_leaves': (1000, 1280),
#                 'feature_fraction': (0.7, 0.9),
#                 'bagging_fraction': (0.8, 1),
#                 'max_depth': (10, 11),
#                 'lambda_l1': (2, 5),
#                 'lambda_l2': (2, 5),
#                 'min_split_gain': (0.001, 0.1),
#                 'min_child_weight': (5, 50),
#                 'learning_rate' : (.05,.07)
#                }

# # You can experiments with different estimators in a single execution. I'm using small numbers for demonstration purpose so you must change to high numbers.
# cv_estimators = [50,100,200]

# #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
# optimization_round = 10 # Simply, 10 models with different parameter will be tested. And the best one will be returned.

# #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.
# init_points = 2

# random_seed = 2010

In [14]:
# best_params= []
# for cv_estimator in cv_estimators:
#     opt_params = lgb_best_params(features, target,params_range, init_points=init_points, optimization_round=optimization_round, n_folds=fold, random_seed=random_seed, cv_estimators=cv_estimator)
#     opt_params['params']['iteration'] = cv_estimator
#     opt_params['params']['fold'] = fold
#     opt_params['params']['rmse'] = opt_params['target']
#     best_params.append(opt_params['params'])

In [15]:
# df_params = pd.DataFrame(best_params).reset_index()
# df_params = df_params[['iteration','fold','num_leaves','learning_rate','bagging_fraction',
#  'feature_fraction',
#  'lambda_l1',
#  'lambda_l2',
#  'max_depth',
#  'min_child_weight',
#  'min_split_gain',
#  'rmse']]
# df_params.to_csv('best_params.csv')