In [1]:
import numpy as np
import pandas as pd
import warnings
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
color = sns.color_palette()
sns.set_style("darkgrid")
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_pickle('df_train.pkl')

In [3]:
df_train.sample(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
3794905,963,1,2016-03-12 17:00:00,253.177994,9,Lodging/residential,44784,,,18.90625,4.0,14.398438,0.0,1014.0,,1.5
11259961,355,0,2016-07-25 05:00:00,266.25,3,Education,182500,1970.0,,29.40625,4.0,24.40625,0.0,1014.5,160.0,2.099609
11366919,538,0,2016-07-27 03:00:00,6.52,3,Religious worship,10000,2010.0,,29.40625,8.0,21.703125,0.0,1013.5,220.0,3.599609
9579613,261,0,2016-06-25 08:00:00,199.589996,2,Education,89770,,,33.90625,2.0,8.296875,0.0,1005.5,100.0,3.599609
488237,914,1,2016-01-09 20:00:00,267.351013,9,Education,229973,,,11.101562,,1.099609,0.0,1017.0,320.0,5.699219


In [4]:
print(f"Max Air Temp:{np.max(df_train['air_temperature'])}")
print(f"Min Air Temp:{np.min(df_train['air_temperature'])}")
print(f"Max Dew Temp:{np.max(df_train['dew_temperature'])}")
print(f"Min Dew Temp:{np.min(df_train['dew_temperature'])}")
print(f"Max Precipitation Depth:{np.max(df_train['precip_depth_1_hr'])}")
print(f"Min Precipitation Depth:{np.min(df_train['precip_depth_1_hr'])}")

Max Air Temp:47.1875
Min Air Temp:-28.90625
Max Dew Temp:26.09375
Min Dew Temp:-35.0
Max Precipitation Depth:343.0
Min Precipitation Depth:-1.0


In [5]:
df_train.dtypes

building_id                    int16
meter                           int8
timestamp             datetime64[ns]
meter_reading                float32
site_id                         int8
primary_use                   object
square_feet                    int32
year_built                   float16
floor_count                  float16
air_temperature              float16
cloud_coverage               float16
dew_temperature              float16
precip_depth_1_hr            float16
sea_level_pressure           float16
wind_direction               float16
wind_speed                   float16
dtype: object

In [6]:
df_train.isna().sum()

building_id                  0
meter                        0
timestamp                    0
meter_reading                0
site_id                      0
primary_use                  0
square_feet                  0
year_built            12127645
floor_count           16709167
air_temperature          96658
cloud_coverage         8825365
dew_temperature         100140
precip_depth_1_hr      3749023
sea_level_pressure     1231669
wind_direction         1449048
wind_speed              143676
dtype: int64

In [2]:
from sklearn.preprocessing import LabelEncoder

In [8]:
le = LabelEncoder()
df_train['primary_use'] = le.fit_transform(df_train['primary_use'])

In [9]:
df_train_0 = df_train[df_train['meter']==0]
df_train_1 = df_train[df_train['meter']==1]
df_train_2 = df_train[df_train['meter']==2]
df_train_3 = df_train[df_train['meter']==3]

del df_train
gc.collect()

11

In [10]:
# print(f"No of Unique Primary use for meter{0} is: {df_train_0['primary_use'].nunique()}")
# print(f"No of Unique Primary use for meter{1} is: {df_train_1['primary_use'].nunique()}")
# print(f"No of Unique Primary use for meter{2} is: {df_train_2['primary_use'].nunique()}")
# print(f"No of Unique Primary use for meter{3} is: {df_train_3['primary_use'].nunique()}")

In [3]:
def prepare_data(X, test=False):
    """
    Preparing final dataset with all features.
    """
    X.square_feet = np.log1p(X.square_feet)
    
    if not test:
        X.sort_values("timestamp", inplace=True)
        X.reset_index(drop=True, inplace=True)
    
    gc.collect()
    
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                "2019-01-01"]
    
    X["hour"] = X.timestamp.dt.hour
    X["weekday"] = X.timestamp.dt.weekday
    X["is_holiday"] = (X.timestamp.dt.date.astype("str").isin(holidays)).astype(int)
    
    drop_features = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed"]
    X.drop(drop_features, axis=1, inplace=True)
    
    ####imputing NaNs
    X['year_built'].fillna(-1, inplace=True)
    X['floor_count'].fillna(0, inplace=True)
    X['air_temperature'].fillna(0, inplace=True)
    X['cloud_coverage'].fillna(-1, inplace=True)
    X['dew_temperature'].fillna(0, inplace=True)
    X['precip_depth_1_hr'].fillna(-2, inplace=True)
    
    if test:
        return X
#         row_ids = X.row_id
#         X.drop("row_id", axis=1, inplace=True)
#         return X, row_ids
    else:
        y = np.log1p(X.meter_reading)
        X.drop("meter_reading", axis=1, inplace=True)
        return X, y

In [12]:
X_0, y_0 = prepare_data(df_train_0)
X_1, y_1 = prepare_data(df_train_1)
X_2, y_2 = prepare_data(df_train_2)
X_3, y_3 = prepare_data(df_train_3)

del df_train_0, df_train_1, df_train_2, df_train_3
gc.collect()

34

In [13]:
X_0.head(10)

Unnamed: 0,building_id,meter,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,hour,weekday,is_holiday
0,0,0,0,0,8.913685,2008.0,0.0,25.0,6.0,20.0,-2.0,0,4,1
1,960,0,9,0,10.935568,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
2,959,0,9,0,11.763972,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
3,958,0,9,1,12.366575,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
4,957,0,9,0,11.61905,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
5,956,0,9,6,11.364054,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
6,955,0,9,0,12.198489,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
7,954,0,9,0,12.628804,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
8,953,0,9,1,13.242425,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1
9,952,0,9,0,12.292365,-1.0,0.0,0.0,-1.0,0.0,-2.0,0,4,1


In [14]:
X_0_1 =  X_0[:int(X_0.shape[0] / 2)]
X_0_2 =  X_0[int(X_0.shape[0] / 2):]
X_1_1 =  X_0[:int(X_1.shape[0] / 2)]
X_1_2 =  X_0[int(X_1.shape[0] / 2):]
X_2_1 =  X_0[:int(X_2.shape[0] / 2)]
X_2_2 =  X_0[int(X_2.shape[0] / 2):]
X_3_1 =  X_0[:int(X_3.shape[0] / 2)]
X_3_2 =  X_0[int(X_3.shape[0] / 2):]

y_0_1 = y_0[:int(X_0.shape[0] / 2)]
y_0_2 = y_0[int(X_0.shape[0] / 2):]
y_1_1 = y_0[:int(X_1.shape[0] / 2)]
y_1_2 = y_0[int(X_1.shape[0] / 2):]
y_2_1 = y_0[:int(X_2.shape[0] / 2)]
y_2_2 = y_0[int(X_2.shape[0] / 2):]
y_3_1 = y_0[:int(X_3.shape[0] / 2)]
y_3_2 = y_0[int(X_3.shape[0] / 2):]

del X_0, y_0, X_1, y_1, X_2, y_2, X_3, y_3
gc.collect()

0

In [4]:
import lightgbm as lgb
import random
from sklearn.metrics import mean_squared_error
random.seed(12)

In [16]:
categorical_features = ["building_id", "site_id", "meter", "primary_use", "hour", "weekday"]

d_0_1 = lgb.Dataset(X_0_1, label=y_0_1, categorical_feature=categorical_features, free_raw_data=False)
d_0_2 = lgb.Dataset(X_0_2, label=y_0_2, categorical_feature=categorical_features, free_raw_data=False)
d_1_1 = lgb.Dataset(X_1_1, label=y_1_1, categorical_feature=categorical_features, free_raw_data=False)
d_1_2 = lgb.Dataset(X_1_2, label=y_1_2, categorical_feature=categorical_features, free_raw_data=False)
d_2_1 = lgb.Dataset(X_2_1, label=y_2_1, categorical_feature=categorical_features, free_raw_data=False)
d_2_2 = lgb.Dataset(X_2_2, label=y_2_2, categorical_feature=categorical_features, free_raw_data=False)
d_3_1 = lgb.Dataset(X_3_1, label=y_3_1, categorical_feature=categorical_features, free_raw_data=False)
d_3_2 = lgb.Dataset(X_3_2, label=y_3_2, categorical_feature=categorical_features, free_raw_data=False)

In [17]:
watchlist_0_1 = [d_0_1, d_0_2]
watchlist_0_2 = [d_0_2, d_0_1]
watchlist_1_1 = [d_1_1, d_1_2]
watchlist_1_2 = [d_1_2, d_1_1]
watchlist_2_1 = [d_2_1, d_2_2]
watchlist_2_2 = [d_2_2, d_2_1]
watchlist_3_1 = [d_3_1, d_3_2]
watchlist_3_2 = [d_3_2, d_3_1]

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

In [18]:
print("Building LGBM Model with for meter 0 with 1st half of data and validation on 2nd half:")
model_0_1 = lgb.train(params, train_set=d_0_1, 
                         num_boost_round=1000, valid_sets=watchlist_0_1, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 0 with 2nd half of data and validation on 1st half:")
model_0_2 = lgb.train(params, train_set=d_0_2, 
                         num_boost_round=1000, valid_sets=watchlist_0_2, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 1 with 1st half of data and validation on 2nd half:")
model_1_1 = lgb.train(params, train_set=d_1_1, 
                         num_boost_round=1000, valid_sets=watchlist_1_1, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 1 with 2nd half of data and validation on 1st half:")
model_1_2 = lgb.train(params, train_set=d_1_2, 
                         num_boost_round=1000, valid_sets=watchlist_1_2, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 2 with 1st half of data and validation on 2nd half:")
model_2_1 = lgb.train(params, train_set=d_2_1, 
                         num_boost_round=1000, valid_sets=watchlist_2_1, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 2 with 2nd half of data and validation on 1st half:")
model_2_2 = lgb.train(params, train_set=d_2_2, 
                         num_boost_round=1000, valid_sets=watchlist_2_2, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 3 with 1st half of data and validation on 2nd half:")
model_3_1 = lgb.train(params, train_set=d_3_1, 
                         num_boost_round=1000, valid_sets=watchlist_3_1, 
                         verbose_eval=200, early_stopping_rounds=200)
print('#'*50) 

print("Building LGBM Model with for meter 3 with 2nd half of data and validation on 1st half:")
model_3_2 = lgb.train(params, train_set=d_3_2, 
                         num_boost_round=1000, valid_sets=watchlist_3_2, 
                         verbose_eval=200, early_stopping_rounds=200)

Building LGBM Model with for meter 0 with 1st half of data and validation on 2nd half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.545571	valid_1's rmse: 1.18394
Early stopping, best iteration is:
[31]	training's rmse: 0.786673	valid_1's rmse: 1.15263
##################################################
Building LGBM Model with for meter 0 with 2nd half of data and validation on 1st half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.485943	valid_1's rmse: 1.4731
Early stopping, best iteration is:
[32]	training's rmse: 0.695521	valid_1's rmse: 1.43825
##################################################
Building LGBM Model with for meter 1 with 1st half of data and validation on 2nd half:
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 0.388747	valid_1's rmse: 1.46856
Early stopping, best iteration is:
[19]	training's rmse: 0.856259	valid_1's rmse: 1.31837
##############

In [36]:
models = {'model_0_1' : model_0_1,
          'model_0_2' : model_0_2,
          'model_1_1' : model_1_1,
          'model_1_2' : model_1_2,
          'model_2_1' : model_2_1,
          'model_2_2' : model_2_2,
          'model_3_1' : model_3_1,
          'model_3_2' : model_3_2 }

In [5]:
import pickle

In [38]:
for i in range(4):
    for j in range(1,3):
        picfile = open(f'model_{i}_{j}', 'wb') 
        pickle.dump(models[f'model_{i}_{j}'], picfile)
        picfile.close() 

In [41]:
picfile = open('le', 'wb') 
pickle.dump(le, picfile)
picfile.close() 

In [42]:
del X_0_1, X_0_2, X_1_1, X_1_2, X_2_1, X_2_2, X_3_1, X_3_2
del y_0_1, y_0_2, y_1_1, y_1_2, y_2_1, y_2_2, y_3_1, y_3_2
    
gc.collect()

329

In [6]:
df_test = pd.read_pickle('df_test.pkl')

In [7]:
models = {}
for i in range(4):
    for j in range(1,3):
        picfile = open(f'model_{i}_{j}', 'rb') 
        models[f'model_{i}_{j}'] = pickle.load(picfile)
        picfile.close() 
        
picfile = open('le', 'rb') 
le = pickle.load(picfile)
picfile.close() 

In [8]:
df_test.primary_use = le.transform(df_test.primary_use)

In [9]:
# X_test, row_ids = prepare_data(df_test, test=True)
X_test = prepare_data(df_test, test=True)
del df_test
gc.collect()

33

In [10]:
for i in range(4):
    temp = X_test[X_test['meter'] == i]
    row = temp['row_id']
    temp.drop('row_id', axis=1, inplace=True)
    for j in range(1,3):
        if j == 1:
            pred = np.expm1(models[f'model_{i}_{j}'].predict(temp, num_iteration=models[f'model_{i}_{j}'].best_iteration)) / 2
            del models[f'model_{i}_{j}']
            gc.collect()
        else:
            pred += np.expm1(models[f'model_{i}_{j}'].predict(temp, num_iteration=models[f'model_{i}_{j}'].best_iteration)) / 2
            del models[f'model_{i}_{j}']
            gc.collect()
    
    pd.DataFrame({"row_id": row, "meter_reading": np.clip(pred, 0, a_max=None)}).to_csv(f"submission{i}.csv", index=False)
    del temp,  row, pred
    gc.collect()

In [11]:
del X_test
gc.collect()

22

In [12]:
sub0 = pd.read_csv('submission0.csv')
sub1 = pd.read_csv('submission1.csv')
sub2 = pd.read_csv('submission2.csv')
sub3 = pd.read_csv('submission3.csv')

In [14]:
frames = [sub0, sub1, sub2, sub3]

In [15]:
result = pd.concat(frames)
del sub0, sub1, sub2, sub3
gc.collect()

1143

In [16]:
result.shape

(41697600, 2)

In [17]:
result.to_csv('submission.csv', index=False)