## Training Model

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.svm import SVR
import gc
import os
import random
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm
%matplotlib inline
import seaborn as sns
from time import time
from datetime import datetime
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
import math
from sklearn.externals import joblib
from time import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from joblib import parallel_backend
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import cross_val_score

## Loading Data

In [3]:
%%time
#weather data
weather_train = pd.read_csv("weather_train.csv")
weather_test = pd.read_csv("weather_test.csv")
#building data
building = pd.read_csv("building_metadata.csv")
#train and test data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

CPU times: user 24.9 s, sys: 1.97 s, total: 26.8 s
Wall time: 23.3 s


## Reduce Memory

In [4]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
weather_train = reduce_mem_usage(weather_train)
weather_test = reduce_mem_usage(weather_test)
building = reduce_mem_usage(building)
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)
Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)


## Merging tables

In [6]:
%%time
train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')

train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')
del weather_train, weather_test, building

CPU times: user 43.3 s, sys: 7.26 s, total: 50.5 s
Wall time: 26 s


In [7]:
train.shape

(20216100, 16)

In [8]:
test.shape

(41697600, 16)

## Data Preprocessing

### Missing Value

In [9]:
missing = pd.DataFrame(train.isna().sum()/len(train),columns=["Train_Missing_Pct"])
missing["Test_Missing_Pct"] = test.isna().sum()/len(test)
missing

Unnamed: 0,Train_Missing_Pct,Test_Missing_Pct
building_id,0.0,0.0
meter,0.0,0.0
timestamp,0.0,0.0
meter_reading,0.0,
site_id,0.0,0.0
primary_use,0.0,0.0
square_feet,0.0,0.0
year_built,0.5999,0.589916
floor_count,0.826528,0.82605
air_temperature,0.004781,0.005322


In [10]:
%%time
#replace other missing values with median value
train_mean = train.sample(80000).median()
train = train.fillna(train_mean)
test_mean = test.sample(80000).median()
test = test.fillna(test_mean)

CPU times: user 9.57 s, sys: 1.64 s, total: 11.2 s
Wall time: 8.03 s


In [11]:
#check if all missing values are covered
missing = pd.DataFrame(train.isna().sum()/len(train),columns=["Train_Missing_Pct"])
missing["Test_Missing_Pct"] = test.isna().sum()/len(test)
missing

Unnamed: 0,Train_Missing_Pct,Test_Missing_Pct
building_id,0.0,0.0
meter,0.0,0.0
timestamp,0.0,0.0
meter_reading,0.0,
site_id,0.0,0.0
primary_use,0.0,0.0
square_feet,0.0,0.0
year_built,0.0,0.0
floor_count,0.0,0.0
air_temperature,0.0,0.0


### Holiday

In [12]:
train.timestamp = pd.to_datetime(train.timestamp, format="%Y-%m-%d %H:%M:%S")
test.timestamp = pd.to_datetime(test.timestamp, format="%Y-%m-%d %H:%M:%S")

In [13]:
holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
            "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
            "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
            "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
            "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
            "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
            "2019-01-01"]

In [14]:
train["is_holiday"] = (train.timestamp.dt.date.astype("str").isin(holidays)).astype(int)
test["is_holiday"] = (test.timestamp.dt.date.astype("str").isin(holidays)).astype(int)

### Timestamep 

In [15]:
%%time
#train dataset
train["month"] = pd.to_datetime(train.timestamp).dt.month
train["day"] = pd.to_datetime(train.timestamp).dt.day
train["weekday"] = pd.to_datetime(train.timestamp).dt.weekday
train["hour"] = pd.to_datetime(train.timestamp).dt.hour

CPU times: user 6.42 s, sys: 1.27 s, total: 7.69 s
Wall time: 2.82 s


In [16]:
train['year_built'] = train['year_built'].max() - train['year_built'] + 1
train['square_feet'] = np.log(train['square_feet'])

In [17]:
%%time
#test dataset
test["month"] = pd.to_datetime(test.timestamp).dt.month
test["day"] = pd.to_datetime(test.timestamp).dt.day
test["weekday"] = pd.to_datetime(test.timestamp).dt.weekday
test["hour"] = pd.to_datetime(test.timestamp).dt.hour

CPU times: user 9.68 s, sys: 1.76 s, total: 11.4 s
Wall time: 5.72 s


In [18]:
test['year_built'] = test['year_built'].max() - test['year_built'] + 1
test['square_feet'] = np.log(test['square_feet'])

In [19]:
test.head(100)

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,is_holiday,month,day,weekday,hour
0,0,0,0,2017-01-01,0,Education,8.913550,10.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
1,1,1,0,2017-01-01,0,Education,7.908387,14.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
2,2,2,0,2017-01-01,0,Education,8.589700,27.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
3,3,3,0,2017-01-01,0,Education,10.072597,16.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
4,4,4,0,2017-01-01,0,Education,11.666565,43.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
5,5,5,0,2017-01-01,0,Education,8.987197,18.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
6,6,6,0,2017-01-01,0,Lodging/residential,10.237313,37.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
7,7,7,0,2017-01-01,0,Education,11.704157,29.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
8,8,7,1,2017-01-01,0,Education,11.704157,29.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0
9,9,8,0,2017-01-01,0,Education,11.015493,15.0,3.0,17.796875,...,11.703125,0.0,1021.5,100.0,3.599609,1,1,1,6,0


### Weather Data

In [20]:
train.drop(columns=["wind_direction"], axis=1, inplace=True)
test.drop(columns=["wind_direction"], axis=1, inplace=True)

In [21]:
train.drop(columns=["sea_level_pressure"], axis=1, inplace=True)
test.drop(columns=["sea_level_pressure"], axis=1, inplace=True)

In [22]:
train.drop(columns=["wind_speed"], axis=1, inplace=True)
test.drop(columns=["wind_speed"], axis=1, inplace=True)

### Categorical Variables

In [23]:
train["primary_use"].value_counts()

Education                        8165504
Office                           4394864
Entertainment/public assembly    2264917
Lodging/residential              2146413
Public services                  1662826
Healthcare                        398527
Other                             242222
Parking                           213796
Manufacturing/industrial          125713
Food sales and service            114090
Retail                            112657
Warehouse/storage                 111861
Services                           96519
Technology/science                 77627
Utility                            56203
Religious worship                  32361
Name: primary_use, dtype: int64

In [24]:
#encode primary_use
le = preprocessing.LabelEncoder()
train['primary_use'] = le.fit_transform(train['primary_use'])
test['primary_use'] = le.fit_transform(test['primary_use'])

In [25]:
test.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,is_holiday,month,day,weekday,hour
0,0,0,0,2017-01-01,0,0,8.91355,10.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
1,1,1,0,2017-01-01,0,0,7.908387,14.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
2,2,2,0,2017-01-01,0,0,8.5897,27.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
3,3,3,0,2017-01-01,0,0,10.072597,16.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
4,4,4,0,2017-01-01,0,0,11.666565,43.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0


In [26]:
test.dtypes

row_id                        int32
building_id                   int16
meter                          int8
timestamp            datetime64[ns]
site_id                        int8
primary_use                   int64
square_feet                 float64
year_built                  float16
floor_count                 float16
air_temperature             float16
cloud_coverage              float16
dew_temperature             float16
precip_depth_1_hr           float16
is_holiday                    int64
month                         int64
day                           int64
weekday                       int64
hour                          int64
dtype: object

### Drop data

In [27]:
idx_to_drop = list((train[(train['site_id'] == 0) & (train['timestamp'] < "2016-05-21 00:00:00")]).index)
print (len(idx_to_drop))
train.drop(idx_to_drop,axis='rows',inplace=True)

392857


In [28]:
#drop timestamp column
train.drop(columns=["timestamp"], inplace=True)
test.drop(columns=["timestamp"], inplace=True)

In [29]:
# dropping all the electricity meter readings that are 0, after considering them as anomalies.
idx_to_drop = list(train[(train['meter'] == 0) & (train['meter_reading'] == 0)].index)
print(len(idx_to_drop))
train.drop(idx_to_drop,axis='rows',inplace=True)

185592


## Split Train and Test by Meter Types

In [30]:
test['meter'].value_counts()

0    24755760
1     8724960
2     5676480
3     2540400
Name: meter, dtype: int64

In [31]:
#type 0 electricity
train_0 = train[train['meter'] == 0]
test_0 = test[test['meter'] == 0]

In [32]:
train_0.drop(['meter'], axis=1, inplace=True)
test_0.drop(['meter'], axis=1, inplace=True)

In [33]:
#type 1 chilledwater
train_1 = train[train['meter'] == 1]
test_1 = test[test['meter'] == 1]

In [34]:
train_1.drop(['meter'], axis=1, inplace=True)
test_1.drop(['meter'], axis=1, inplace=True)

In [35]:
#type 2 steam
train_2 = train[train['meter'] == 2]
test_2 = test[test['meter'] == 2]

In [36]:
train_2.drop(['meter'], axis=1, inplace=True)
test_2.drop(['meter'], axis=1, inplace=True)

In [37]:
#type 3 hotwater
train_3 = train[train['meter'] == 3]
test_3 = test[test['meter'] == 3]

In [38]:
train_3.drop(['meter'], axis=1, inplace=True)
test_3.drop(['meter'], axis=1, inplace=True)

In [39]:
del train

# Meter Type 0 Lightgbm Half and Half

In [40]:
#splitting x and y
train_0_y = train_0["meter_reading"]
train_0_X = train_0.drop(columns=["meter_reading"], axis=1)

In [41]:
%%time
X_train, X_test, y_train, y_test = train_test_split(train_0_X, train_0_y, train_size = 0.8, random_state=1080)

CPU times: user 7.09 s, sys: 705 ms, total: 7.79 s
Wall time: 4.33 s


In [42]:
y_train = np.log1p(y_train)

In [43]:
X_half_1 = X_train[:int(X_train.shape[0] / 2)]
X_half_2 = X_train[int(X_train.shape[0] / 2):]

y_half_1 = y_train[:int(X_train.shape[0] / 2)]
y_half_2 = y_train[int(X_train.shape[0] / 2):]

categorical_features = ["building_id", "site_id", "primary_use", "month", "day", "hour", "weekday", "is_holiday"]

d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categorical_features, free_raw_data=False)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categorical_features, free_raw_data=False)

watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

### Grid Search Code for Model parameters

In [None]:
# for leaves in range(20, 60, 5):
#    for learning_rate in np.arange(0.01, 0.2, 0.03):
#        for feature_frac in np.arange(0.7, 1.05, 0.05):
#            for reg_lambda in np.arange(0, 5, 1):
#                params = {
#                    "objective": "regression",
#                    "boosting": "gbdt",
#                    "num_leaves": leaves,
#                    "learning_rate": learning_rate,
#                    "feature_fraction": feature_frac,
#                    "reg_lambda": reg_lambda,
#                    "metric": "rmse"
#                }



#                cv_dataset = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)
            
#                cv_model = lgb.train(params, train_set=cv_dataset, num_boost_round=200, valid_sets=watchlist_1, verbose_eval=200)

#                pred = np.expm1(cv_model.predict(X_test, num_iteration=cv_model.best_iteration))

#                pred = pd.DataFrame(pred, columns=["meter_reading"])
#                pred[pred.meter_reading < 0] = 0

#                score = mean_squared_log_error(y_test, pred)

#                print("num_leaves:", leaves, "\tlearning_rate:", learning_rate, "\tfeature_fraction:", feature_frac, "\treg_lambda:", reg_lambda, "\tSCORE:", score)

### Model Training

In [47]:
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, num_boost_round=1000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, num_boost_round=1000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)

Building model with first half and validating on second half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.309316	valid_1's rmse: 0.310415
[400]	training's rmse: 0.277811	valid_1's rmse: 0.279643
[600]	training's rmse: 0.262691	valid_1's rmse: 0.265217
[800]	training's rmse: 0.25161	valid_1's rmse: 0.254729
[1000]	training's rmse: 0.243188	valid_1's rmse: 0.246865
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.243188	valid_1's rmse: 0.246865
Building model with second half and validating on first half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.310644	valid_1's rmse: 0.311691
[400]	training's rmse: 0.278703	valid_1's rmse: 0.280632
[600]	training's rmse: 0.263604	valid_1's rmse: 0.266082
[800]	training's rmse: 0.252702	valid_1's rmse: 0.255705
[1000]	training's rmse: 0.244835	valid_1's rmse: 0.248398
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.2

In [48]:
pred = np.expm1(model_half_1.predict(X_test, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred += np.expm1(model_half_2.predict(X_test, num_iteration=model_half_2.best_iteration)) / 2    
gc.collect()

19

In [49]:
pred = pd.DataFrame(pred, columns=["meter_reading"])
pred[pred.meter_reading < 0] = 0
score = math.sqrt(mean_squared_log_error(y_test, pred))
score

0.24645591646651063

In [50]:
del pred

### Meter 0 Prediction

In [51]:
prediction_0 = pd.DataFrame()
prediction_0['row_id'] = test_0['row_id']
test_0.drop(['row_id'], axis=1, inplace=True)

In [52]:
test_0.head()

Unnamed: 0,building_id,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,is_holiday,month,day,weekday,hour
0,0,0,0,8.91355,10.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
1,1,0,0,7.908387,14.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
2,2,0,0,8.5897,27.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
3,3,0,0,10.072597,16.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
4,4,0,0,11.666565,43.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0


In [53]:
#prediction
pred_0 = np.expm1(model_half_1.predict(test_0, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred_0 += np.expm1(model_half_2.predict(test_0, num_iteration=model_half_2.best_iteration)) / 2    
del model_half_1
del model_half_2
gc.collect()

19

In [54]:
prediction_0['meter_reading'] = pred_0

In [55]:
prediction_0[prediction_0.meter_reading < 0] = 0

In [56]:
prediction_0.head()

Unnamed: 0,row_id,meter_reading
0,0,175.175186
1,1,71.303906
2,2,4.027667
3,3,302.862581
4,4,1331.741491


In [57]:
prediction_0.describe()

Unnamed: 0,row_id,meter_reading
count,24755760.0,24755760.0
mean,18174170.0,179.2052
std,11525290.0,376.8115
min,0.0,0.0
25%,9501764.0,23.53495
50%,15698270.0,69.29072
75%,27035860.0,178.5391
max,41697600.0,16552.71


In [58]:
test_0 = test[test['meter'] == 0]
# Homogenize the index values
test_0.index = prediction_0.index
# Assign the columns
prediction_0[['row_id']] = test_0[['row_id']]

In [59]:
#check if row_id rewrited sucessfully
prediction_0.loc[prediction_0['row_id'] == 0]

Unnamed: 0,row_id,meter_reading
0,0,175.175186


In [60]:
del X_train, X_test, y_train, y_test

# Meter Type 1 Chilled Water Lightgbm Half and Half

In [61]:
#splitting x and y
train_1_y = train_1["meter_reading"]
train_1_X = train_1.drop(columns=["meter_reading"], axis=1)

In [62]:
%%time
X_train, X_test, y_train, y_test = train_test_split(train_1_X, train_1_y, train_size = 0.8, random_state=1080)

CPU times: user 3.13 s, sys: 306 ms, total: 3.43 s
Wall time: 1.23 s


In [63]:
y_train = np.log1p(y_train)

In [66]:
X_half_1 = X_train[:int(X_train.shape[0] / 2)]
X_half_2 = X_train[int(X_train.shape[0] / 2):]

y_half_1 = y_train[:int(X_train.shape[0] / 2)]
y_half_2 = y_train[int(X_train.shape[0] / 2):]

categorical_features = ["building_id", "site_id", "primary_use", "month", "day", "hour", "weekday", "is_holiday"]

d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categorical_features, free_raw_data=False)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categorical_features, free_raw_data=False)

watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.1,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, num_boost_round=1000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, num_boost_round=1000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)

Building model with first half and validating on second half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.874336	valid_1's rmse: 0.882527
[400]	training's rmse: 0.7808	valid_1's rmse: 0.796974
[600]	training's rmse: 0.733235	valid_1's rmse: 0.755743
[800]	training's rmse: 0.700853	valid_1's rmse: 0.728824
[1000]	training's rmse: 0.675029	valid_1's rmse: 0.708185
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.675029	valid_1's rmse: 0.708185
Building model with second half and validating on first half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.870305	valid_1's rmse: 0.882744
[400]	training's rmse: 0.78817	valid_1's rmse: 0.806665
[600]	training's rmse: 0.739079	valid_1's rmse: 0.763187
[800]	training's rmse: 0.705446	valid_1's rmse: 0.734829
[1000]	training's rmse: 0.677029	valid_1's rmse: 0.711255
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.677

In [67]:
pred = np.expm1(model_half_1.predict(X_test, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred += np.expm1(model_half_2.predict(X_test, num_iteration=model_half_2.best_iteration)) / 2    
gc.collect()

19

In [68]:
pred = pd.DataFrame(pred, columns=["meter_reading"])
pred[pred.meter_reading < 0] = 0
score = math.sqrt(mean_squared_log_error(y_test, pred))
score

0.7015363734884307

In [70]:
del pred, score

### Meter 1 Prediction

In [71]:
prediction_1 = pd.DataFrame()
prediction_1['row_id'] = test_1['row_id']
test_1.drop(['row_id'], axis=1, inplace=True)

In [72]:
test_1.head()

Unnamed: 0,building_id,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,is_holiday,month,day,weekday,hour
8,7,0,0,11.704157,29.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
11,9,0,6,10.203592,8.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
16,13,0,0,11.506706,18.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
18,14,0,0,11.365005,5.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0
20,15,0,6,11.33806,44.0,3.0,17.796875,4.0,11.703125,0.0,1,1,1,6,0


In [73]:
#prediction
pred_1 = np.expm1(model_half_1.predict(test_1, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred_1 += np.expm1(model_half_2.predict(test_1, num_iteration=model_half_2.best_iteration)) / 2    
del model_half_1
del model_half_2
gc.collect()

19

In [74]:
prediction_1['meter_reading'] = pred_1

In [75]:
prediction_1.head()

Unnamed: 0,row_id,meter_reading
8,8,38.812801
11,11,1.864254
16,16,47.247488
18,18,382.18159
20,20,288.70018


In [76]:
prediction_1[prediction_1.meter_reading < 0] = 0

In [77]:
prediction_1.describe()

Unnamed: 0,row_id,meter_reading
count,8724960.0,8724960.0
mean,22094860.0,459.6394
std,13105030.0,2758.73
min,0.0,0.0
25%,7360804.0,11.28074
50%,23818580.0,96.57898
75%,33546690.0,394.4574
max,41697550.0,1150862.0


In [78]:
test_1 = test[test['meter'] == 1]
# Homogenize the index values,
test_1.index = prediction_1.index
# Assign the columns.
prediction_1[['row_id']] = test_1[['row_id']]

In [79]:
prediction_1.loc[prediction_1['row_id'] == 0]

Unnamed: 0,row_id,meter_reading


In [80]:
#prediction_1.to_csv("0.1_Meter1.csv", index = False)

# Meter Type 2 Steam Lightgbm Half and Half

In [81]:
#splitting x and y
train_2_y = train_2["meter_reading"]
train_2_X = train_2.drop(columns=["meter_reading"], axis=1)

In [82]:
%%time
X_train, X_test, y_train, y_test = train_test_split(train_2_X, train_2_y, train_size = 0.8, random_state=1080)

CPU times: user 1.64 s, sys: 151 ms, total: 1.79 s
Wall time: 660 ms


In [83]:
y_train = np.log1p(y_train)

In [84]:
X_half_1 = X_train[:int(X_train.shape[0] / 2)]
X_half_2 = X_train[int(X_train.shape[0] / 2):]

y_half_1 = y_train[:int(X_train.shape[0] / 2)]
y_half_2 = y_train[int(X_train.shape[0] / 2):]

categorical_features = ["building_id", "site_id", "primary_use", "month", "day", "hour", "weekday", "is_holiday"]

d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categorical_features, free_raw_data=False)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categorical_features, free_raw_data=False)

watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.1,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, num_boost_round=1000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, num_boost_round=1000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)

Building model with first half and validating on second half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.00157	valid_1's rmse: 1.01302
[400]	training's rmse: 0.91721	valid_1's rmse: 0.939335
[600]	training's rmse: 0.874869	valid_1's rmse: 0.906831
[800]	training's rmse: 0.84547	valid_1's rmse: 0.887166
[1000]	training's rmse: 0.822416	valid_1's rmse: 0.873845
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.822416	valid_1's rmse: 0.873845
Building model with second half and validating on first half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.986143	valid_1's rmse: 1.00088
[400]	training's rmse: 0.90832	valid_1's rmse: 0.932974
[600]	training's rmse: 0.867817	valid_1's rmse: 0.902395
[800]	training's rmse: 0.839696	valid_1's rmse: 0.883635
[1000]	training's rmse: 0.818047	valid_1's rmse: 0.871401
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.818047

In [85]:
pred = np.expm1(model_half_1.predict(X_test, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred += np.expm1(model_half_2.predict(X_test, num_iteration=model_half_2.best_iteration)) / 2    
#del model_half_2
gc.collect()

19

In [86]:
pred = pd.DataFrame(pred, columns=["meter_reading"])
pred[pred.meter_reading < 0] = 0
#pred = pred.round(4)
score = math.sqrt(mean_squared_log_error(y_test, pred))
score

0.8617608266001348

In [87]:
del pred, score

### Meter 2 Prediction

In [88]:
prediction_2 = pd.DataFrame()
prediction_2['row_id'] = test_2['row_id']
test_2.drop(['row_id'], axis=1, inplace=True)

In [89]:
#prediction
pred_2 = np.expm1(model_half_1.predict(test_2, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred_2 += np.expm1(model_half_2.predict(test_2, num_iteration=model_half_2.best_iteration)) / 2    
del model_half_1
del model_half_2
gc.collect()

19

In [90]:
prediction_2['meter_reading'] = pred_2

In [91]:
prediction_2[prediction_2.meter_reading < 0] = 0

In [93]:
prediction_2.describe()

Unnamed: 0,row_id,meter_reading
count,5676480.0,5676480.0
mean,28606630.0,8158.021
std,8405139.0,269931.7
min,0.0,0.0
25%,22297280.0,42.47617
50%,29015550.0,234.4737
75%,35858270.0,979.026
max,41697590.0,31794320.0


In [94]:
test_2 = test[test['meter'] == 2]
# Homogenize the index values,
test_2.index = prediction_2.index
# Assign the columns.
prediction_2[['row_id']] = test_2[['row_id']]

In [95]:
prediction_2.head()

Unnamed: 0,row_id,meter_reading
16340505,16340505,0.644605
16340509,16340509,238.554437
16340515,16340515,3994.642147
16340517,16340517,0.0
16340521,16340521,271.891681


In [96]:
#prediction_2.to_csv("0.1_Meter2.csv", index = False)

In [97]:
del X_train, X_test, y_train, y_test

# Meter Type 3 Hot Water Lightgbm Half and Half

In [98]:
#splitting x and y
train_3_y = train_3["meter_reading"]
train_3_X = train_3.drop(columns=["meter_reading"], axis=1)

In [99]:
%%time
X_train, X_test, y_train, y_test = train_test_split(train_3_X, train_3_y, train_size = 0.8, random_state=2080)

CPU times: user 651 ms, sys: 69.7 ms, total: 721 ms
Wall time: 271 ms


In [100]:
y_train = np.log1p(y_train)

In [113]:
X_half_1 = X_train[:int(X_train.shape[0] / 2)]
X_half_2 = X_train[int(X_train.shape[0] / 2):]

y_half_1 = y_train[:int(X_train.shape[0] / 2)]
y_half_2 = y_train[int(X_train.shape[0] / 2):]

categorical_features = ["building_id", "site_id", "primary_use", "month", "day", "hour", "weekday", "is_holiday"]

d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categorical_features, free_raw_data=False)
d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categorical_features, free_raw_data=False)

watchlist_1 = [d_half_1, d_half_2]
watchlist_2 = [d_half_2, d_half_1]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.15,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

print("Building model with first half and validating on second half:")
model_half_1 = lgb.train(params, train_set=d_half_1, num_boost_round=1000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)

print("Building model with second half and validating on first half:")
model_half_2 = lgb.train(params, train_set=d_half_2, num_boost_round=1000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)

Building model with first half and validating on second half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.03185	valid_1's rmse: 1.07295
[400]	training's rmse: 0.951033	valid_1's rmse: 1.02401
[600]	training's rmse: 0.902047	valid_1's rmse: 1.00402
[800]	training's rmse: 0.8623	valid_1's rmse: 0.990702
[1000]	training's rmse: 0.828353	valid_1's rmse: 0.980716
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.828353	valid_1's rmse: 0.980716
Building model with second half and validating on first half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.0377	valid_1's rmse: 1.07306
[400]	training's rmse: 0.954463	valid_1's rmse: 1.02369
[600]	training's rmse: 0.905926	valid_1's rmse: 1.00549
[800]	training's rmse: 0.866369	valid_1's rmse: 0.992555
[1000]	training's rmse: 0.831893	valid_1's rmse: 0.983411
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.831893	vali

In [114]:
pred = np.expm1(model_half_1.predict(X_test, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred += np.expm1(model_half_2.predict(X_test, num_iteration=model_half_2.best_iteration)) / 2    
#del model_half_2
gc.collect()

19

In [115]:
pred = pd.DataFrame(pred, columns=["meter_reading"])
pred[pred.meter_reading < 0] = 0
#pred = pred.round(4)
score = math.sqrt(mean_squared_log_error(y_test, pred))
score

0.9597877250291816

### Meter 3 Prediction

In [116]:
prediction_3 = pd.DataFrame()
prediction_3['row_id'] = test_3['row_id']
test_3.drop(['row_id'], axis=1, inplace=True)

In [117]:
#prediction
pred_3 = np.expm1(model_half_1.predict(test_3, num_iteration=model_half_1.best_iteration)) / 2
gc.collect()
pred_3 += np.expm1(model_half_2.predict(test_3, num_iteration=model_half_2.best_iteration)) / 2    
del model_half_1
del model_half_2
gc.collect()

19

In [118]:
prediction_3['meter_reading'] = pred_3

In [119]:
prediction_3[prediction_3.meter_reading < 0] = 0

In [120]:
test_3 = test[test['meter'] == 3]
# Homogenize the index values,
test_3.index = prediction_3.index
# Assign the columns.
prediction_3[['row_id']] = test_3[['row_id']]

In [121]:
prediction_3.describe()

Unnamed: 0,row_id,meter_reading
count,2540400.0,2540400.0
mean,19940510.0,266.8934
std,14028460.0,1180.302
min,2260082.0,0.0
25%,5596286.0,2.230265
50%,25237190.0,33.39284
75%,34222440.0,191.8831
max,41697370.0,444355.4


In [122]:
prediction_3.head()

Unnamed: 0,row_id,meter_reading
2260082,2260082,1.71179
2260086,2260086,3.941176
2260090,2260090,18.673329
2260092,2260092,46.319654
2260094,2260094,405.175753


In [123]:
prediction_3.loc[prediction_3['row_id'] == 0]

Unnamed: 0,row_id,meter_reading


In [124]:
#prediction_3.to_csv("0.15_Meter3.csv", index = False)

In [125]:
del pred

In [126]:
del X_train, X_test, y_train, y_test

## Merging Tables

In [127]:
frames = [prediction_0, prediction_1, prediction_2, prediction_3]
prediction = pd.concat(frames)

In [132]:
prediction = prediction.sort_values('row_id')

In [133]:
prediction.reset_index(inplace=True, drop=True)

In [134]:
prediction.shape

(41697600, 2)

In [135]:
prediction.head(20)

Unnamed: 0,row_id,meter_reading
0,0,175.175186
1,1,71.303906
2,2,4.027667
3,3,302.862581
4,4,1331.741491
5,5,10.769713
6,6,127.821117
7,7,453.821322
8,8,38.812801
9,9,415.420948


In [136]:
any(prediction['row_id'].duplicated())

False

In [137]:
prediction.to_csv("final_submission.csv", index = False)