In [1]:
import numpy as np
import pandas as pd
import gc, warnings, random, math
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 35)

In [2]:
SEED = 17
TARGET = 'meter_reading'
random.seed(SEED)
np.random.seed(SEED)

def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [3]:
df_train = pd.read_pickle('df_train.pkl')
df_test = pd.read_pickle('df_test.pkl')

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
le = LabelEncoder()
df_train['primary_use'] = df_train['primary_use'].astype(str)
df_train['primary_use'] = le.fit_transform(df_train['primary_use']).astype(np.int8)

In [6]:
for df in [df_train, df_test]:
    
    df['DT_D'] = df['timestamp'].dt.dayofyear.astype(np.int16)
    df['DT_M'] = df['timestamp'].dt.month.astype(np.int8)
    df['DT_hour'] = df['timestamp'].dt.hour.astype(np.int8)
    df["weekday"] = df['timestamp'].dt.weekday

In [7]:
df_train.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,DT_D,DT_M,DT_hour,weekday
0,0,0,2016-01-01,0.0,0,0,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,1,1,0,4
1,1,0,2016-01-01,0.0,0,0,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,1,1,0,4
2,2,0,2016-01-01,0.0,0,0,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,1,1,0,4
3,3,0,2016-01-01,0.0,0,0,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,1,1,0,4
4,4,0,2016-01-01,0.0,0,0,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0,1,1,0,4


In [8]:
temp = df_test[~df_test['building_id'].isin(df_train['building_id'])]
print(f"{len(temp)}: Buildings are exclusive in test set")
temp = df_test[~df_test['site_id'].isin(df_train['site_id'])]
print(f"{len(temp)}: Sites are exclusive in test set")
del temp, df_test
gc.collect()

0: Buildings are exclusive in test set
0: Sites are exclusive in test set


33

## Creating Holdout sets

In [9]:
#1st Holdout set
# Split the train set by site_id -> ~20% in the validation set
train,test = train_test_split(df_train['site_id'].unique(), test_size=0.2, random_state=SEED)
holdout_set1 = df_train[df_train['site_id'].isin(test)].reset_index(drop=True)
df_train = df_train[df_train['site_id'].isin(train)].reset_index(drop=True)

#2nd Holdout set
#Split the train set by building_id -> ~20% in the validation set
train,test = train_test_split(df_train['building_id'].unique(), test_size=0.2, random_state=SEED)
holdout_set2 = df_train[df_train['building_id'].isin(test)].reset_index(drop=True)
df_train = df_train[df_train['building_id'].isin(train)].reset_index(drop=True)

#3rd Holdout set
#Split the train set by month -> First and Last Month in holdout set
holdout_set3 = df_train[(df_train['DT_M']==1) | (df_train['DT_M'] ==12)].reset_index(drop=True)
df_train = df_train[(df_train['DT_M']!=1) & (df_train['DT_M']) !=12].reset_index(drop=True)

gc.collect()

for df in [df_train, holdout_set1, holdout_set2, holdout_set3]:
    df['meter_reading'] = np.log1p(df['meter_reading'])
    print(df.shape)

(11100638, 20)
(6093592, 20)
(3021870, 20)
(1886471, 20)


In [10]:
remove_columns = ['timestamp', 'site_id', 'building_id', TARGET]
features_columns = [col for col in list(df_train) if col not in remove_columns]

X = df_train[features_columns]
y = df_train[TARGET]

split_by_building = df_train['building_id']
split_by_site = df_train['site_id']
split_by_month = df_train['DT_M']

del df_train
gc.collect()

22

In [11]:
READING_1 = holdout_set1[[TARGET]]
READING_2 = holdout_set2[[TARGET]]
READING_3 = holdout_set3[[TARGET]]

all_readings = {
    1:[READING_1, holdout_set1, 'site_id_hold_out'],
    2:[READING_2, holdout_set2, 'build_id_hold_out'],
    3:[READING_3, holdout_set3, 'month_hold_out']
}

In [12]:
for _,df in all_readings.items():
    df[0]['test'] = 0    
    print('Ground RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['test']))
    del df[0]['test']
    print('#'*50) 

Ground RMSE for site_id_hold_out | 4.079004001648642
##################################################
Ground RMSE for build_id_hold_out | 4.852578694256156
##################################################
Ground RMSE for month_hold_out | 4.891736616059323
##################################################


In [13]:
## WE will be using same number of splits for traning the model
N_SPLITS = 5

In [14]:
import lightgbm as lgb

In [15]:
params = {
                    'objective':'regression',
                    'boosting_type':'gbdt',
                    'metric':'rmse',
                    'n_jobs':-1,
                    'learning_rate':0.3, #for faster training
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':1000,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                } 

In [19]:
#Trying Multiple bossting round to find the best among them
for n_rounds in [25,50,100,200]:
    
    print('#'*50)
    print('No Validation training...', n_rounds, 'boosting rounds')
    corrected_lgb_params = params.copy()
    corrected_lgb_params['n_estimators'] = n_rounds
    corrected_lgb_params['early_stopping_rounds'] = None

    train_data = lgb.Dataset(X, label=y)
    
    estimator = lgb.train(
                corrected_lgb_params,
                train_data
            )
    gc.collect()
    for _,df in all_readings.items():
        df[0]['no_validation_'+str(n_rounds)] = estimator.predict(df[1][features_columns])
        print('RMSE for',
              df[2], '|',
              rmse(df[0][TARGET], df[0]['no_validation_'+str(n_rounds)]))
        print('#'*20)

##################################################
No Validation training... 25 boosting rounds
RMSE for site_id_hold_out | 1.8034363811658465
####################
RMSE for build_id_hold_out | 1.8356204160147487
####################
RMSE for month_hold_out | 1.019487394171583
####################
##################################################
No Validation training... 50 boosting rounds
RMSE for site_id_hold_out | 1.8214261174575703
####################
RMSE for build_id_hold_out | 1.8588791429308105
####################
RMSE for month_hold_out | 0.9110584084338825
####################
##################################################
No Validation training... 100 boosting rounds
RMSE for site_id_hold_out | 1.847634791180678
####################
RMSE for build_id_hold_out | 1.9018452064427318
####################
RMSE for month_hold_out | 0.7692495781105358
####################
##################################################
No Validation training... 200 boosting rounds
RMSE fo

__Results suggest there is data leakage by in our dataset__ 
* leakage by Site_ID ->> Model Performs poorly on unknown sites hence doesn't generalize well 
* leakage by Building_IX ->> Model Performs poorly on unknown buildings hence doesn't generalize well 

Luckily our train set has same site and building ids as that of test set. Probably we don't need to smooth differences between them and can even make differences more explicit.
So, a good CV set should have all buildings and site information as bulidings and sites have very unqiue energy consumption.

In [20]:
gc.collect()

112

In [29]:
print('#'*20)
print('KFold (with shuffle) training...')

from sklearn.model_selection import KFold
folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for _,df in all_readings.items():
    df[0]['shuffle_kfold'] = 0
        
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )
    
    for _,df in all_readings.items():
        df[0]['shuffle_kfold'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_readings.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['shuffle_kfold']))
    print('#'*20) 

####################
KFold (with shuffle) training...
Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.840837	valid_1's rmse: 0.847049
[200]	training's rmse: 0.740143	valid_1's rmse: 0.752516
[300]	training's rmse: 0.691265	valid_1's rmse: 0.709239
[400]	training's rmse: 0.661194	valid_1's rmse: 0.684618
[500]	training's rmse: 0.641667	valid_1's rmse: 0.67071
[600]	training's rmse: 0.626189	valid_1's rmse: 0.660287
[700]	training's rmse: 0.612522	valid_1's rmse: 0.651362
[800]	training's rmse: 0.600321	valid_1's rmse: 0.643619
[900]	training's rmse: 0.591052	valid_1's rmse: 0.638665
[1000]	training's rmse: 0.581723	valid_1's rmse: 0.633969
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.581723	valid_1's rmse: 0.633969
Fold: 2
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.839413	valid_1's rmse: 0.844919
[200]	training's rmse: 0.76023	valid_1's rmse: 0.771301
[300]	training's 

__Note: Changing the number of splits 5 to 3, as it is taking quite a bit of time, but will use 5-Fold for the traning the final model__

In [18]:
print('#'*20)
print('KFold (no shuffle) training...')

from sklearn.model_selection import KFold
folds = KFold(n_splits=N_SPLITS, shuffle=False, random_state=SEED)

for _,df in all_readings.items():
    df[0]['no_shuffle_kfold'] = 0
        
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )
    
    for _,df in all_readings.items():
        df[0]['no_shuffle_kfold'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_readings.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['no_shuffle_kfold']))
    print('#'*20) 

####################
KFold (no shuffle) training...
Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.822771	valid_1's rmse: 1.14989
[200]	training's rmse: 0.733507	valid_1's rmse: 1.11497
[300]	training's rmse: 0.685091	valid_1's rmse: 1.10126
[400]	training's rmse: 0.657216	valid_1's rmse: 1.09781
[500]	training's rmse: 0.636075	valid_1's rmse: 1.09798
Early stopping, best iteration is:
[413]	training's rmse: 0.653841	valid_1's rmse: 1.09713
Fold: 2
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.819684	valid_1's rmse: 1.29443
[200]	training's rmse: 0.729502	valid_1's rmse: 1.27238
[300]	training's rmse: 0.677533	valid_1's rmse: 1.26768
[400]	training's rmse: 0.649741	valid_1's rmse: 1.26828
Early stopping, best iteration is:
[302]	training's rmse: 0.677133	valid_1's rmse: 1.26754
Fold: 3
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.850857	valid_1's rmse: 1.18

__Note: Data Leakage by month: Consumption differ from month to month on a quite significant magnitude, hence we cannot exclude any data by month.__

In [19]:
print('#'*20)
print('GroupKFold site_id split training...') 

from sklearn.model_selection import GroupKFold
folds = GroupKFold(n_splits=N_SPLITS)

for _,df in all_readings.items():
    df[0]['Groupkfold_by_site'] = 0
      
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_by_site)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )

    for _,df in all_readings.items():
        df[0]['Groupkfold_by_site'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_readings.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['Groupkfold_by_site']))
    print('#'*20)  

####################
GroupKFold site_id split training...
Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.689716	valid_1's rmse: 2.46855
Early stopping, best iteration is:
[4]	training's rmse: 1.50392	valid_1's rmse: 2.21927
Fold: 2
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.76513	valid_1's rmse: 2.45799
Early stopping, best iteration is:
[6]	training's rmse: 1.35218	valid_1's rmse: 2.34451
Fold: 3
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.818165	valid_1's rmse: 2.01695
Early stopping, best iteration is:
[2]	training's rmse: 1.95204	valid_1's rmse: 1.59192
Fold: 4
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.871039	valid_1's rmse: 2.18259
Early stopping, best iteration is:
[1]	training's rmse: 2.11216	valid_1's rmse: 1.8382
Fold: 5
Training until validation scores don't improve for 100 rounds
[100]	training's r

In [20]:
print('#'*20)
print('GroupKFold building_id split training...') 

from sklearn.model_selection import GroupKFold
folds = GroupKFold(n_splits=N_SPLITS)

for _,df in all_readings.items():
    df[0]['Groupkfold_by_building'] = 0
      
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_by_building)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )

    for _,df in all_readings.items():
        df[0]['Groupkfold_by_building'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_readings.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['Groupkfold_by_building']))
    print('#'*20)  

####################
GroupKFold building_id split training...
Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.793632	valid_1's rmse: 1.97884
Early stopping, best iteration is:
[9]	training's rmse: 1.28488	valid_1's rmse: 1.89974
Fold: 2
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.799639	valid_1's rmse: 2.01162
Early stopping, best iteration is:
[20]	training's rmse: 1.08446	valid_1's rmse: 1.96904
Fold: 3
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.794838	valid_1's rmse: 1.93188
Early stopping, best iteration is:
[6]	training's rmse: 1.39585	valid_1's rmse: 1.8236
Fold: 4
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.78317	valid_1's rmse: 1.93792
Early stopping, best iteration is:
[20]	training's rmse: 1.09354	valid_1's rmse: 1.84784
Fold: 5
Training until validation scores don't improve for 100 rounds
[100]	traini

In [21]:
print('#'*20)
print('GroupKFold month split training...') 

from sklearn.model_selection import GroupKFold
folds = GroupKFold(n_splits=N_SPLITS)

for _,df in all_readings.items():
    df[0]['Groupkfold_by_month'] = 0
      
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_by_month)):
    print('Fold:',fold_+1)
    tr_x, tr_y = X.iloc[trn_idx,:], y[trn_idx]    
    vl_x, v_y = X.iloc[val_idx,:], y[val_idx]    
    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=v_y)  

    estimator = lgb.train(
            params,
            train_data,
            valid_sets = [train_data, valid_data],
            verbose_eval = 100,
        )

    for _,df in all_readings.items():
        df[0]['Groupkfold_by_month'] += estimator.predict(df[1][features_columns])/N_SPLITS

for _,df in all_readings.items():
    print('RMSE for', df[2], '|',
          rmse(df[0][TARGET], df[0]['Groupkfold_by_month']))
    print('#'*20)

####################
GroupKFold month split training...
Fold: 1
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.82462	valid_1's rmse: 1.19126
[200]	training's rmse: 0.724895	valid_1's rmse: 1.14063
[300]	training's rmse: 0.68016	valid_1's rmse: 1.12515
[400]	training's rmse: 0.653834	valid_1's rmse: 1.1187
[500]	training's rmse: 0.632643	valid_1's rmse: 1.11052
[600]	training's rmse: 0.615497	valid_1's rmse: 1.10501
[700]	training's rmse: 0.600371	valid_1's rmse: 1.10254
[800]	training's rmse: 0.586894	valid_1's rmse: 1.10002
[900]	training's rmse: 0.577362	valid_1's rmse: 1.09844
[1000]	training's rmse: 0.565809	valid_1's rmse: 1.09642
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.565809	valid_1's rmse: 1.09642
Fold: 2
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.815783	valid_1's rmse: 1.19848
[200]	training's rmse: 0.737851	valid_1's rmse: 1.17847
[300]	training's rmse: 0.6865

### Findings:

* Dataleakage is preventing model to generalize well.
* The best way to divide the data will be on the month basis, using 10 months for traning and 2 months for validation
* Timesatamp has to be fixed
* More boosting rounds with early stopping is to be used
* Have to train several models on different SEEDS and have to average them out

Train set - first 4 month
Skip - next 4 month
Valid set - last 4 month