In [1]:
import numpy as np
import pandas as pd
import warnings
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Default seaborn palette, kept in `color` (not referenced again in the visible cells).
color = sns.color_palette()
sns.set_style("darkgrid")
# Silences every warning category for the rest of the session, including
# pandas deprecation / chained-assignment warnings.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import random
from sklearn.metrics import mean_squared_error
random.seed(12)
import pickle

In [3]:
def prepare_data(X, test=False):
    """
    Build the final model-ready feature frame.

    The frame is modified in place (no copy is made, which keeps peak memory
    down) and the same object is returned, so callers should use the return
    value and treat the frame they passed in as consumed.

    Steps, in order:
      * ``square_feet`` is replaced by ``log1p(square_feet)``.
      * Training frames only: rows are sorted by ``timestamp`` and the index
        is reset. Test frames keep their original row order.
      * ``hour``, ``weekday`` and ``is_holiday`` are derived from ``timestamp``.
      * ``timestamp``, ``sea_level_pressure``, ``wind_direction`` and
        ``wind_speed`` are dropped.
      * NaNs in ``year_built``, ``floor_count``, ``air_temperature``,
        ``cloud_coverage``, ``dew_temperature`` and ``precip_depth_1_hr`` are
        replaced by fixed sentinel values.

    Any other column (for example an id column) is passed through untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the columns named above; ``timestamp`` must be
        datetime-like because the ``.dt`` accessor is used on it. When
        ``test`` is False it must also contain ``meter_reading``.
    test : bool, default False
        If True, skip sorting and target extraction and return only the frame.

    Returns
    -------
    pandas.DataFrame
        When ``test`` is True: the prepared frame.
    tuple of (pandas.DataFrame, pandas.Series)
        When ``test`` is False: the prepared frame without ``meter_reading``
        and the target ``log1p(meter_reading)``.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    X["square_feet"] = np.log1p(X["square_feet"])

    if not test:
        X.sort_values("timestamp", inplace=True)
        X.reset_index(drop=True, inplace=True)

    gc.collect()

    # Calendar dates flagged as holidays (presumably US federal holidays for
    # 2016-2018 plus New Year's Day 2019 -- confirm against the data source).
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                "2017-01-01", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                "2019-01-01"]

    timestamps = X["timestamp"]
    X["hour"] = timestamps.dt.hour
    X["weekday"] = timestamps.dt.weekday
    # Compare on the ISO date string so the time of day does not matter.
    X["is_holiday"] = timestamps.dt.date.astype("str").isin(holidays).astype(int)
    del timestamps

    drop_features = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed"]
    X.drop(columns=drop_features, inplace=True)

    # Sentinel used for missing values in each column. The columns are
    # re-assigned rather than filled through ``X[col].fillna(..., inplace=True)``:
    # that chained form acts on a temporary Series, is deprecated in pandas 2.x
    # and does not modify the frame when Copy-on-Write is enabled.
    nan_fill_values = {
        "year_built": -1,
        "floor_count": 0,
        "air_temperature": 0,
        "cloud_coverage": -1,
        "dew_temperature": 0,
        "precip_depth_1_hr": -2,
    }
    for column, fill_value in nan_fill_values.items():
        X[column] = X[column].fillna(fill_value)

    if test:
        return X

    # The target is modelled on the log1p scale; pop() removes it from the features.
    y = np.log1p(X.pop("meter_reading"))
    return X, y

In [None]:
# Load the training frame from a local pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_train = pd.read_pickle('df_train.pkl')

In [None]:
# Show five randomly chosen rows (unseeded, so the preview differs between runs).
df_train.sample(5)

In [None]:
# Report the observed range (max first, then min) of three weather columns.
for label, column in (
    ("Air Temp", "air_temperature"),
    ("Dew Temp", "dew_temperature"),
    ("Precipitation Depth", "precip_depth_1_hr"),
):
    print(f"Max {label}:{np.max(df_train[column])}")
    print(f"Min {label}:{np.min(df_train[column])}")

In [None]:
# Column dtypes of the training frame.
df_train.dtypes

In [None]:
# Number of missing values per column.
df_train.isna().sum()

In [None]:
# Replace the `primary_use` labels with integer codes. The fitted encoder is
# kept in `le` so the same mapping can be applied to the test frame later.
# NOTE: this overwrites the column, so re-running the cell would refit the
# encoder on the already-encoded integers.
le = LabelEncoder()
df_train['primary_use'] = le.fit_transform(df_train['primary_use'])

In [None]:
# One training frame per meter code (0-3); the combined frame is then
# released to free memory.
df_train_0, df_train_1, df_train_2, df_train_3 = (
    df_train.loc[df_train['meter'].eq(meter_code)] for meter_code in range(4)
)

del df_train
gc.collect()

In [None]:
# Feature frame and log1p target for each meter, prepared in meter order.
(X_0, y_0), (X_1, y_1), (X_2, y_2), (X_3, y_3) = (
    prepare_data(meter_frame)
    for meter_frame in (df_train_0, df_train_1, df_train_2, df_train_3)
)

In [None]:
# First ten rows of the prepared meter-0 feature frame.
X_0.head(10)

In [None]:
# Columns LightGBM should treat as categorical rather than numeric.
categorical_features = ["building_id", "site_id", "meter", "primary_use", "hour", "weekday"]

# Options shared by all four per-meter datasets; the raw data is kept
# (free_raw_data=False) instead of being released after construction.
dataset_options = dict(categorical_feature=categorical_features, free_raw_data=False)

d_0 = lgb.Dataset(X_0, label=y_0, **dataset_options)
d_1 = lgb.Dataset(X_1, label=y_1, **dataset_options)
d_2 = lgb.Dataset(X_2, label=y_2, **dataset_options)
d_3 = lgb.Dataset(X_3, label=y_3, **dataset_options)

In [None]:
# Each "validation" list holds only the matching training Dataset, so the
# metric reported while training is the training RMSE.
# NOTE(review): with no held-out data, early stopping has nothing independent
# to monitor -- consider a time-based validation split.
watchlist_0 = [d_0]
watchlist_1 = [d_1]
watchlist_2 = [d_2]
watchlist_3 = [d_3]

# LightGBM parameters shared by all four per-meter models.
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 40,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse"
}

In [None]:
def _fit_meter_model(meter, train_set, valid_sets):
    """Train one LightGBM booster for a single meter type and return it.

    Uses the notebook-level `params`, at most 1000 boosting rounds, a
    200-round early-stopping patience and an evaluation log every 200 rounds.

    Early stopping and logging are passed as callbacks because the
    `early_stopping_rounds` / `verbose_eval` keyword arguments of `lgb.train`
    were removed in LightGBM 4.0.
    """
    print(f"Building LGBM Model with for meter {meter}: ")
    # `log_evaluation` was called `print_evaluation` in older LightGBM releases.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    booster = lgb.train(
        params,
        train_set=train_set,
        num_boost_round=1000,
        valid_sets=valid_sets,
        # Fresh callback objects per model so no state is shared between runs.
        callbacks=[lgb.early_stopping(stopping_rounds=200), log_evaluation(period=200)],
    )
    print('#'*50)
    return booster


model_0 = _fit_meter_model(0, d_0, watchlist_0)
model_1 = _fit_meter_model(1, d_1, watchlist_1)
model_2 = _fit_meter_model(2, d_2, watchlist_2)
model_3 = _fit_meter_model(3, d_3, watchlist_3)


In [None]:
# Trained boosters keyed as 'model_<meter>'.
models = dict(
    model_0=model_0,
    model_1=model_1,
    model_2=model_2,
    model_3=model_3,
)

In [None]:
# Persist each per-meter booster to a file named after its key in `models`
# ('model_0' ... 'model_3'). `with` closes the handle even if pickling fails.
for i in range(4):
    with open(f'model_{i}', 'wb') as picfile:
        pickle.dump(models[f'model_{i}'], picfile)

# Persist the fitted label encoder so the test data can be encoded identically.
with open('le', 'wb') as picfile:
    pickle.dump(le, picfile)

In [None]:
# Drop the training features/targets and ask the garbage collector to run.
del X_0, y_0, X_1, y_1, X_2, y_2, X_3, y_3
gc.collect()

In [4]:
# Load the test frame from a local pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df_test = pd.read_pickle('df_test.pkl')

In [5]:
# Reload the per-meter boosters and the label encoder written by the save cell.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this notebook or another trusted source.
models = {}
for i in range(4):
    with open(f'model_{i}', 'rb') as picfile:
        models[f'model_{i}'] = pickle.load(picfile)

with open('le', 'rb') as picfile:
    le = pickle.load(picfile)

In [6]:
# Apply the encoder fitted on the training data to the test `primary_use` column.
df_test["primary_use"] = le.transform(df_test["primary_use"])

In [7]:
# Build the test features; with test=True only the frame is returned and
# `row_id` stays in it for the prediction step.
X_test = prepare_data(df_test, test=True)
del df_test
gc.collect()

33

In [8]:
# Score the test rows one meter type at a time with that meter's booster and
# write each batch to its own submission<i>.csv.
for i in range(4):
    booster = models[f'model_{i}']
    meter_rows = X_test[X_test['meter'] == i]
    row_ids = meter_rows['row_id']
    # Select exactly the columns the booster was trained on, in training
    # order. This leaves out `row_id` without an in-place drop on a filtered
    # slice, and guards against the test frame having a different column order.
    features = meter_rows[booster.feature_name()]
    del meter_rows
    # The models were fit on log1p(meter_reading); expm1 maps back to raw units.
    pred = np.expm1(booster.predict(features, num_iteration=booster.best_iteration))
    gc.collect()
    # Clip so that no negative reading is written out.
    pd.DataFrame({"row_id": row_ids, "meter_reading": np.clip(pred, 0, a_max=None)}).to_csv(f"submission{i}.csv", index=False)
    del features, row_ids, pred
    gc.collect()

In [9]:
# The test features are no longer needed once the partial submissions are on disk.
del X_test
gc.collect()

22

In [10]:
# Read the four per-meter prediction files back in, in meter order.
sub0, sub1, sub2, sub3 = map(
    pd.read_csv,
    ('submission0.csv', 'submission1.csv', 'submission2.csv', 'submission3.csv'),
)

In [15]:
# Partial submissions in meter order, ready to be concatenated.
frames = [sub0, sub1, sub2, sub3]

In [16]:
# Stack the partial submissions row-wise; each part keeps its own index values.
result = pd.concat(frames)
# NOTE: `frames` still references the four parts, so deleting these names
# alone does not release their memory.
del sub0, sub1, sub2, sub3
gc.collect()

0

In [17]:
# Sanity check: (rows, columns) of the combined submission.
result.shape

(41697600, 2)

In [18]:
# Write the final submission without the index column.
result.to_csv('submission.csv', index=False)