In [40]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING

In [2]:
root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
pd.set_option("display.max_rows", 10, "display.max_columns", None)
plt.rcParams["figure.figsize"] = (20, 12)
sns.set_style("darkgrid")

## Helper Functions

In [3]:
def sr_encoder(dataframe, colname):
    """
    encodes the label using Supervised ratio technique.

    inputs: dataframe and column_name
    output: dataframe with encoded predictor
    """

    label_list = list(dataframe[colname].unique())       # list of all the labels in the predictors
    label_list.sort()                                    # sorting the list

    supervised_ratio = []                                # initializing list and datatype
    label_dict = {}

    for i in label_list:
        event = len(dataframe[dataframe[colname] == i])  # filtering out the colname
        total = len(dataframe)
        SR = round((event/total)*1e3, 3)                 # supervised_ratio
        label_dict[i] = SR                               # forming the dictionary

    dataframe[colname] = dataframe[colname].map(label_dict)

    return dataframe, label_dict

def save_json(dictionary, json_path):
    """
    saves the dictionary for future use. Possibly to encode test values

    input: dict, json file name with extension
    returns nothing
    """

    json_obj = {}
    for k, v in dictionary.items():
        key = str(k)
        json_obj[key] = v

    with open(json_path, "w") as json_file:
        json.dump(json_obj, json_file)

In [4]:
def datetime_engg(dataframe):
    """
    creates new features from datetime column
    """
    dataframe["hour"] = dataframe["timestamp"].dt.hour
    dataframe["day_of_week"] = dataframe["timestamp"].dt.dayofweek
    dataframe["month"] = dataframe["timestamp"].dt.month
    
    return dataframe

In [5]:
train_path = os.path.join(root_dir, "data", "interim_data", "train_agg.csv")
train_df = pd.read_csv(train_path)
train_df.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


# 2. Data Preparation
### 2.1. converting the datatype and filling in Nulls

In [6]:
# converting timestamp to datetime format
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"])

# filling in nulls with -999
train_df = train_df.fillna(-999)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 16 columns):
building_id           int64
meter                 int64
timestamp             datetime64[ns]
meter_reading         float64
site_id               int64
primary_use           object
square_feet           int64
year_built            float64
floor_count           float64
air_temperature       float64
cloud_coverage        float64
dew_temperature       float64
precip_depth_1_hr     float64
sea_level_pressure    float64
wind_direction        float64
wind_speed            float64
dtypes: datetime64[ns](1), float64(10), int64(4), object(1)
memory usage: 2.4+ GB


### 2.2. Encoding the categorical variables
In the dataset, following are the variables that are categorical in nature:-
- `building_id`
- `site_id`
- `primary_use`
- `year_built`

These variables are to be encoded with certain techniques so that we can correctly input the information to the model, reducing the noise.

In [8]:
train_df, puse_dict = sr_encoder(train_df, colname = "primary_use")          # encoding the primary using supervised ratio
train_df, site_dict = sr_encoder(train_df, colname = "site_id")              # encoding the site_id using supervised ratio

# saving the json
save_json(puse_dict, os.path.join(root_dir, "models", "primary_use.json"))
save_json(site_dict, os.path.join(root_dir, "models", "site_id.json"))

In [9]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,53.258,403.911,7432,2008.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0
1,1,0,2016-01-01,0.0,53.258,403.911,2720,2004.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0
2,2,0,2016-01-01,0.0,53.258,403.911,5376,1991.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0
3,3,0,2016-01-01,0.0,53.258,403.911,23685,2002.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0
4,4,0,2016-01-01,0.0,53.258,403.911,116607,1975.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0


### 2.3. Engineering the Datatime feature

In [10]:
train_df = datetime_engg(train_df)
train_df = train_df.drop(columns = ["timestamp"])

In [11]:
train_df.head()

Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,day_of_week,month
0,0,0,0.0,53.258,403.911,7432,2008.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0,0,4,1
1,1,0,0.0,53.258,403.911,2720,2004.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0,0,4,1
2,2,0,0.0,53.258,403.911,5376,1991.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0,0,4,1
3,3,0,0.0,53.258,403.911,23685,2002.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0,0,4,1
4,4,0,0.0,53.258,403.911,116607,1975.0,-999.0,25.0,6.0,20.0,-999.0,1019.7,0.0,0.0,0,4,1


# 3. Machine Learning Models

## 3.1. XgBoost

In [12]:
x = train_df.drop(columns = ["meter_reading"])
y = train_df["meter_reading"]

del train_df

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, shuffle = False)

del x, y

In [13]:
# define RMSLE
def RMSLE(y_pred, y_true):
    """
    custom eval metric for XGBClassifier
    """
    err = np.sqrt(mean_squared_log_error(y_true, y_pred))
    return "RMSLE", err

In [41]:
def hyperparameter_tuning(params):
    """
    hypertunes a XGBoost model
    
    inp: parameters
    outp: score per fold
    """
    params = {
        "max_depth": int(params["max_depth"]),                              # max depth of the tree
        "gamma": "{:.3f}".format(params["gamma"]),                          # min loss reduction required to make a split
        "subsample": "{:.2f}".format(params["subsample"]),                  # denotes the fraction of observations to be randomly samples for each tree
        "reg_alpha": "{:.3f}".format(params["reg_alpha"]),                  # L1 regularization weight
        "reg_lambda": "{:.3f}".format(params["reg_lambda"]),                # L2 regularization weight
        "learning_rate": "{:.3f}".format(params["learning_rate"]),          # learning rate of XGB
        "num_leaves": "{:.3f}".format(params["num_leaves"]),                 
        "colsample_bytree": "{:.3f}".format(params["colsample_bytree"]),
        "min_child_samples": "{:.3f}".format(params["min_child_samples"]),
        "feature_fraction": "{:.3f}".format(params["feature_fraction"]),
        "bagging_fraction": "{:.3f}".format(params["bagging_fraction"]),
        "objective": "reg:squarederror",
        "RMSLE": RMSLE,
        #"eval_metric": "error"
    }
    
    print("#"*25)
    print("Params = {}".format(params))
    FOLDS = 10  # defining the folds required
    count = 1   # count of HPT cycles
    kf = KFold(n_splits = FOLDS, shuffle = False, random_state = 42)
    y_oof = np.zeros(x_train.shape[0])
    MSLE_mean = 0
    for trn_idx, val_idx in kf.split(x_train):
        # for purpose of saving memory, reducing the size of lists
        trn_idx = list(filter(lambda x: (x % 10 == 0), trn_idx))
        val_idx = list(filter(lambda x: (x % 10 == 0), val_idx))
        
        # define the classifier
        regressor = xgboost.XGBRegressor(random_state = 42, 
                                   verbose = True, 
                                   # tree_method = "gpu_hist",
                                   **params)
        # spliting data into train and valid sets
        train_x, valid_x = x_train.iloc[trn_idx], x_train.iloc[val_idx]
        train_y, valid_y = y_train.iloc[trn_idx], y_train.iloc[val_idx]
        
        # fit the estimator and predict
        regressor.fit(train_x, train_y)
        pred = regressor.predict(valid_x)
        pred = [0 for i in pred if i < 0]
        # eval metrics
        score_MSLE = make_scorer(mean_squared_error)(regressor, valid_x, valid_y)
        MSLE_mean += score_MSLE
        print("Count = {} ... score_MSLE = {:.4f}".format(count, score_MSLE))
        count += 1
    
    gc.collect()
    print("Mean MSLE = {:.4}".format(MSLE_mean / FOLDS))
    del train_x, valid_x, train_y, valid_y, regressor, score_MSLE
    
    return -(MLSE_mean/FOLDS)

In [42]:
space = {
    "max_depth": hp.quniform("max_depth", 6, 8, 1),
    "reg_alpha": hp.uniform("reg_alpha", 0.01, 0.05),
    "reg_lambda": hp.uniform("reg_lambda", 0.01, 0.05),
    "learning_rate": hp.uniform("learning_rate", 0.001, 0.2),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.3, 0.9),
    "gamma": hp.uniform("gamma", 0.01, 0.7),
    "num_leaves": hp.choice("num_leaves", list(range(20, 250, 10))),
    "min_child_samples": hp.choice("min_child_samples", list(range(100, 250, 10))),
    "subsample": hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
}

In [None]:
%%time

# Set algoritm parameters
best = fmin(fn = hyperparameter_tuning,
            space = space,
            algo = tpe.suggest,
            max_evals = 5)

# Print best parameters
best_params = space_eval(space, best)

#########################                          
Params = {'max_depth': 8, 'gamma': '0.692', 'subsample': '0.70', 'reg_alpha': '0.037', 'reg_lambda': '0.032', 'learning_rate': '0.061', 'num_leaves': '130.000', 'colsample_bytree': '0.610', 'min_child_samples': '210.000', 'feature_fraction': '0.448', 'bagging_fraction': '0.656', 'objective': 'reg:squarederror', 'RMSLE': <function RMSLE at 0x7f7f246597b8>}
Count = 1 ... score_MSLE = 1553646281.2323         
  0%|          | 0/5 [03:18<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \



Count = 2 ... score_MSLE = 7132009579.7845         
  0%|          | 0/5 [06:35<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \



Count = 3 ... score_MSLE = 24263455569.9251        
  0%|          | 0/5 [09:44<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \



Count = 4 ... score_MSLE = 16045832604.2610        
  0%|          | 0/5 [12:55<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \



Count = 5 ... score_MSLE = 9336915786.5508         
  0%|          | 0/5 [16:03<?, ?it/s, best loss: ?]

  if getattr(data, 'base', None) is not None and \

