# Building Data Genome Project 2.0
## Predictive models

Biam! (pic.biam@gmail.com)

In [1]:
# data and numbers
import numpy as np
import pandas as pd
import math
import datetime
from glob import glob
import gc

# Visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
mpl.rcParams['figure.figsize'] = (12,8)
mpl.rcParams['font.size'] = 12
mpl.rcParams['figure.dpi'] = 80
import seaborn as sns
sns.set_style("darkgrid")
mpl.style.use('ggplot')

# Metrics & Models
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
import lightgbm as lgb

# Introduction

# Datasets

<p><b>Dev (meter reading)</b></p>
<ul>
<li><code>timestamp</code>: date and time in the format YYYY-MM-DD hh:mm:ss. 2016 and 2017 data.</li>
<li><code>building_id</code>: building code-name with the structure <i>UniqueFirstName_SiteID_primaryspaceusage</i>.</li>
<li><code>meter_reading</code>: meter reading in kilowatt hour (kWh).</li>
<li><code>meter</code>: meter type, <code>chilledwater</code>, <code>electricity</code>, <code>gas</code>, <code>hotwater</code>, <code>irrigation</code>, <code>steam</code> or <code>water</code>.</li>
</ul>
    
<p><b>Buildings metadata</b></p>
<ul>
<li><code>building_id</code>: building code-name with the structure <i>UniqueFirstName_SiteID_primaryspaceusage</i>.</li>
<li><code>site_id</code>: animal-code-name for the site.</li>
<li><code>primaryspaceusage</code>: Primary space usage of all buildings is mapped using the <a href="https://www.energystar.gov/buildings/facility-owners-and-managers/existing-buildings/use-portfolio-manager/identify-your-property-type" rel="nofollow">energystar scheme building description types</a>. </li>
<li><code>sqft</code>: building area in square feet (sq ft). </li>
<li><code>lat</code>: latitude in degrees.</li>
<li><code>lng</code>: longitude in degrees.</li>
 <li><code>electricity</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>hotwater</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>chilledwater</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>steam</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>water</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>irrigation</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>solar</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>gas</code>: presence of this kind of meter in the building. <code>Yes</code> if affirmative, <code>NaN</code> if negative.</li>
<li><code>yearbuilt</code>: year built in the format YYYY.</li>
<li><code>numberoffloors</code>: number of floors.</li>
<li><code>date_opened</code>: date opened in the format D/M/YYYY.</li>
<li><code>sub_primaryspaceusage</code>: <a href="https://www.energystar.gov/buildings/facility-owners-and-managers/existing-buildings/use-portfolio-manager/identify-your-property-type" rel="nofollow">energystar scheme building description types</a> subcategory.</li>
<li><code>energystarscore</code>: <a href="https://www.energystar.gov/buildings/facility-owners-and-managers/existing-buildings/use-portfolio-manager/understand-metrics/how-1-100">Energy Star Score.</a></li>
<li><code>eui</code>: <a href="https://www.energystar.gov/buildings/facility-owners-and-managers/existing-buildings/use-portfolio-manager/understand-metrics/what-energy"> Energy use intensity.</a></li>
<li><code>heatingtype</code>: type of building's heating system.</li>
<li><code>industry</code>: building industry.</li>
<li><code>leed_level</code>: <a href="https://en.wikipedia.org/wiki/Leadership_in_Energy_and_Environmental_Design">Leadership in Energy and Environmental Design.</a></li>
<li><code>occupants</code>: number of occupants?</li>
<li><code>rating</code>: building rating (system or criteria?)</li>
<li><code>site_eui</code>: site energy use intensity.</li>
<li><code>source_eui</code>: ?</li>
<li><code>sqm</code>: building area in square meters?</li>
<li><code>subindustry</code>: building subindustry.</li>
<li><code>timezone</code>: time zone.</li>
</ul>
    
<p><b>Weather</b></p>
<ul>
<li><code>timestamp</code>: date and time in the format YYYY-MM-DD hh:mm:ss.</li>
<li><code>site_id</code>: animal-code-name for the site.</li>
<li><code>apparentTemperature</code>: The apparent (or “feels like”) temperature in degrees Fahrenheit (ºF).</li>
<li><code>cloudCover</code>: The percentage of sky occluded by clouds, between <code>0</code> and <code>1</code>, inclusive.</li>
<li><code>dewPoint</code>: The dew point in degrees Fahrenheit (ºF).</li>
<li><code>humidity</code>: The relative humidity, between <code>0</code> and <code>1</code>, inclusive.</li>
<li><code>precipIntensity</code>: The intensity (in inches of liquid water per hour) of precipitation occurring at the given time (in/h)</li>
<li><code>precipType</code>: The type of precipitation occurring at the given time. If defined, this property will have one of the following values: <code>"rain"</code>, <code>"snow"</code>, or <code>"sleet"</code> (which refers to each of freezing rain, ice pellets, and “wintery mix”). (If <code>precipIntensity</code> is zero, then this property will not be defined.)</li>
<li><code>pressure</code>: The sea-level air pressure in millibars (mbar or hPa).</li>
<li><code>summary</code>: A human-readable text summary of this data point.</li>
<li><code>temperature</code>: The air temperature in degrees Fahrenheit (ºF).</li>
<li><code>uvIndex</code>: The UV index, between <code>0</code> and <code>11</code>, inclusive.</li>
<li><code>visibility</code>: The average visibility in miles, capped at 10 miles (mi).</li>
<li><code>windBearing</code>: The direction that the wind is coming <em>from</em> in degrees, with true north at 0° and progressing clockwise (degrees).</li>
<li><code>windGust</code>: The wind gust speed in miles per hour (mi/h).</li>
<li><code>windSpeed</code>: The wind speed in miles per hour (mi/h).</li>
</ul>

## Feature selection

In this notebook only a selection of features will be used.<br>

<i>Building metadata</i>
<ul>
    <li>Building ID*</li>
    <li>Site ID*</li>
    <li>Primary space usage</li>
    <li>Building size (sqft)</li>
</ul>
<i>Weather data</i>
<ul>
    <li>Timestamp*</li>
    <li>Site ID*</li>
    <li><a href="https://en.wikipedia.org/wiki/Apparent_temperature">Apparent temperature</a></li>
</ul>
<i>Meter reading data</i>
<ul>
    <li>Timestamp*</li>
    <li>Building ID*</li>
    <li>meter</li>
    <li>meter reading (target)</li>
</ul>

In [2]:
# Data directories, built with os.path.join so the separator matches the host
# OS (the hard-coded "..\\data\\..." strings only resolve on Windows).
# The trailing "" keeps a trailing separator, because later cells append the
# file name with plain string concatenation (path + "file.csv").
import os

path_metadata = os.path.join("..", "data", "metadata", "")
path_weather = os.path.join("..", "data", "weather", "")
path_meter = os.path.join("..", "data", "meters", "processed", "")

In [3]:
# Meter reading data
# Loaded with default dtypes, so `timestamp` is still a string column here
# (see the info() output); it is parsed with pd.to_datetime in a later cell.
data = pd.read_csv(path_meter + "allmeters.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53561832 entries, 0 to 53561831
Data columns (total 4 columns):
timestamp        object
building_id      object
meter_reading    float64
meter            object
dtypes: float64(1), object(3)
memory usage: 1.6+ GB


In [4]:
# Buildings metadata
# Only the columns selected in the "Feature selection" section are read:
# the two join keys (building_id, site_id) plus usage and size.
metadata = pd.read_csv(path_metadata + "metadata.csv", usecols = ["building_id",
                                                                 "site_id",
                                                                 "primaryspaceusage",
                                                                 "sqft",
                                                                 ])
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1636 entries, 0 to 1635
Data columns (total 4 columns):
building_id          1636 non-null object
site_id              1636 non-null object
primaryspaceusage    1615 non-null object
sqft                 1636 non-null float64
dtypes: float64(1), object(3)
memory usage: 51.2+ KB


In [5]:
# Weather data
# Only the join keys (timestamp, site_id) and the single weather feature used
# by the models (apparentTemperature) are read.
weather = pd.read_csv(path_weather + "weather.csv", usecols = ["timestamp",
                                                                  "site_id",
                                                                  "apparentTemperature"
                                                                  ])
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333288 entries, 0 to 333287
Data columns (total 3 columns):
timestamp              333288 non-null object
site_id                333288 non-null object
apparentTemperature    333288 non-null float64
dtypes: float64(1), object(2)
memory usage: 7.6+ MB


## Reducing memory size

In [3]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds them.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched. Integer columns are
    narrowed to int8/int16/int32 and float columns to float16/float32 when the
    column's min and max fit in the candidate's range (bounds inclusive).
    A column is never widened: if nothing smaller fits (including columns whose
    min/max are NaN or infinite) it keeps its current dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory usage and the percentage saved.
    use_float16 : bool, default True
        Allow float16 as a target. The range check does not look at precision,
        so float16 rounds values; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        # Candidates are ordered from narrowest to widest: take the first fit.
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # nothing narrower than the current dtype is left to try
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # Any comparison with NaN is False, so all-NaN columns are skipped.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports 0 bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [7]:
# Reduce memory: downcast the numeric columns of the three loaded frames, in
# load order (each call prints its own memory report).
data, metadata, weather = (
    reduce_mem_usage(frame) for frame in (data, metadata, weather)
)

Mem. usage decreased to 1430.26 Mb (12.5% reduction)
Mem. usage decreased to  0.04 Mb (12.5% reduction)
Mem. usage decreased to  5.72 Mb (25.0% reduction)


# Feature engineering based on EDA

 <ul>
    <li><i>Healthcare</i> and <i>Utility</i> usages show the highest meter reading values.</li>
    <li><i>Steam</i> meter shows the highest meter reading values.</li>
    <li>Monthly behaviour (meter-reading median):
    <ul>
        <li><i>Utility</i> usage peaks in April-March.</li>
        <li><i>Chilledwater</i> meter shows higher values in warm season.</li>
        <li><i>Steam</i> meter shows lower values in April-October.</li>
    </ul>
    </li>
    <li>Hourly behaviour (meter-reading median):
    <ul>
        <li>Higher values from 6 hs to 19 hs.</li>
        <li><i>Utility</i> usage shows the opposite tendency.</li>
        <li><i>Steam</i> meter peaks from 5 hs to 8 hs.</li>
    </ul>
    </li>
        <li>Weekday behaviour: lowers during weekends.</li>
</ul>
 Based on these observations, <i>month</i>, <i>day of the week</i> and <i>hour of the day</i> will be added. <code>primaryspaceusage</code> categories (16) will be reduced to <i>healthcare, utility</i> and <i>other</i>. <code>meter</code> categories (8) will be preserved. Final features will be:
 <ul>
    <li>Timestamp*</li>
    <li>Building ID*</li>
    <li>Month</li>
    <li>Hour</li>
    <li>Day of the week</li>
    <li>Usage (dummy, 3 levels: <i>healthcare, utility, other</i>)</li>
    <li>Building size (sqft)</li>
    <li>Apparent temperature</li>
    <li>Meter (dummy, 8 levels)</li>
    <li>Meter reading / target</li>
</ul>

In [8]:
# Reduce primary space usage categories to 3: Healthcare, Utility, other.
# Anything not in the keep-list (missing values included) becomes "other".
kept_usages = ["Healthcare", "Utility"]
collapse_mask = ~metadata["primaryspaceusage"].isin(kept_usages)
metadata.loc[collapse_mask, "primaryspaceusage"] = "other"

In [9]:
# Check new categories
metadata.primaryspaceusage.unique()

array(['other', 'Healthcare', 'Utility'], dtype=object)

In [10]:
# Parse the string timestamps of the meter and weather frames into datetimes,
# using one explicit format for both.
stamp_format = '%Y-%m-%d %H:%M:%S'
data["timestamp"] = pd.to_datetime(data["timestamp"], format=stamp_format)
weather["timestamp"] = pd.to_datetime(weather["timestamp"], format=stamp_format)

In [11]:
# Add features: calendar parts of the reading timestamp, one column each,
# named after the datetime attribute they come from.
for part in ("month", "weekday", "hour"):
    data[part] = getattr(data["timestamp"].dt, part)

In [12]:
# Merge datasets: attach the building metadata to every reading, then the
# weather row for that building's site at the same timestamp. Both are left
# joins, so readings without a matching metadata/weather row are kept (with
# missing values in the joined columns).
data = data.merge(metadata, on="building_id", how="left")
data = data.merge(weather, on=["timestamp", "site_id"], how="left")
del weather, metadata

In [13]:
# site_id was only needed as the weather join key; remove it from the features.
data = data.drop(columns="site_id")

In [14]:
# Drop missings? Removes every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")

In [15]:
# Persist the merged, cleaned frame; the long-term prediction section reloads
# it from this file instead of repeating the merge.
data.to_csv(path_meter + "dev_merged.csv", index=False)

# Models

 <font color="red">**To do:** list models to apply.</font> 

In [None]:
# Seed shared by every estimator below that accepts a random_state.
seed = 55

# Estimators to compare, keyed by the label AllModelsPrediction uses in its
# log messages and output file names. Iteration follows insertion order.
models = dict(
    Dummy=DummyRegressor(),
    Linear=LinearRegression(),
    ElasticNet=ElasticNet(alpha=.1, random_state=seed),
    DecisionTree=DecisionTreeRegressor(random_state=seed),
    RandomForest=RandomForestRegressor(
        n_estimators=10,
        random_state=seed,
        n_jobs=-1,
    ),
    Bagging=BaggingRegressor(
        n_estimators=10,
        max_samples=0.25,
        max_features=1.0,
        random_state=seed,
        n_jobs=-1,
    ),
)

In [None]:
# Fits every model in the `models` dictionary and saves each model's
# validation predictions in a separate csv file.
def AllModelsPrediction(X_train, y_train, X_val, name, savepath):
    """Fit each estimator in the module-level ``models`` dict and save its predictions.

    For every entry of ``models`` the estimator is fitted on the training data
    and used to predict ``X_val``. The predictions are written to
    ``savepath + name + "_" + <model key> + ".csv"`` as one column named after
    the model key, without an index column. Progress and timings are printed.

    Parameters
    ----------
    X_train : training features, passed to ``fit``.
    y_train : training target; must expose ``.values`` (flattened with ``ravel()``).
    X_val : validation features, passed to ``predict``.
    name : str, strategy prefix of the output file names.
    savepath : str, output directory; concatenated as-is, so it must already
        end with a path separator.

    Returns
    -------
    None
    """
    print("Start time: " + str(datetime.datetime.now()))
    print("")

    for key, estimator in models.items():
        label = str(key)
        print("Current model: " + label)
        started = datetime.datetime.now()

        # Fit on the training set.
        # NOTE(review): scikit-learn's fit() presumably returns the estimator
        # itself, in which case the fitted model stays referenced by `models`
        # after the `del` below - confirm if memory matters.
        fitted = estimator.fit(X_train, y_train.values.ravel())
        print(label + " model training complete")

        # Predict the validation set only.
        predictions = fitted.predict(X_val)
        del fitted
        print(label + " model prediction complete")

        # One csv per model, single column named after the model.
        out_file = savepath + name + "_" + label + ".csv"
        pd.DataFrame(predictions, columns=[label]).to_csv(out_file, index=False)
        del predictions
        print(label + " model predictions saved")

        elapsed = datetime.datetime.now() - started
        print(label + " model total time needed: " + str(elapsed))
        print("")
        gc.collect()

    print("Finish time: " + str(datetime.datetime.now()))

In [4]:
# Concatenates all prediction files side by side into one dataframe.
def ConcatAllModels(files):
    """Read every csv in ``files`` and join them column-wise.

    Parameters
    ----------
    files : iterable of str
        Paths of the csv files to load, in the desired column order.

    Returns
    -------
    pandas.DataFrame
        All columns of all files, concatenated along axis 1. Rows are aligned
        on the index each file gets from ``pd.read_csv`` with default
        arguments, so the files are expected to share the same row order.
    """
    frames = [pd.read_csv(path) for path in files]
    return pd.concat(frames, axis=1, ignore_index=False, sort=False)

# Prediction

## Long term prediction: whole year 2017 prediction

**Train:** whole year 2016 (01/2016 to 12/2016)<br>
**Validation:** whole year 2017 (01/2017 to 12/2017)

In [None]:
# Load the merged dataset saved earlier and downcast its numeric columns.
data = reduce_mem_usage(pd.read_csv(path_meter + "dev_merged.csv"))

In [None]:
# The csv round-trip stores timestamps as text; parse them back to datetimes.
stamp_format = '%Y-%m-%d %H:%M:%S'
data["timestamp"] = pd.to_datetime(data["timestamp"], format=stamp_format)

In [None]:
# Strategy split: rows before the cut-off train the models, rows from the
# cut-off onwards validate them.
split_point = "2017-01-01 00:00:00"
train = data[data["timestamp"] < split_point]
val = data[data["timestamp"] >= split_point]
del data

In [None]:
# Move the row identifiers into the index so they are not used as features.
index_cols = ["timestamp", "building_id"]
train = train.set_index(index_cols)
val = val.set_index(index_cols)

### Real values

In [None]:
# Save the real validation readings next to the predictions.
# `timestamp` and `building_id` were moved into the index by the previous cell,
# so selecting them as columns raised KeyError; reset_index() turns them back
# into columns first (and index=False would otherwise drop them from the csv).
val_real = val.reset_index()[["timestamp","building_id","meter","meter_reading"]]
val_real.to_csv("..\\predictions\\longterm_real.csv", index=False)
del(val_real)

### Encoding and Data/Target split

For linear models and decision tree based models (SciKit) use OneHotEncoding:

In [None]:
# Get Dummies
# Encode train and val against the SAME category list (the sorted categories
# seen in training). Encoding each frame independently produces different
# columns - and a different dropped "first" level - whenever the two frames do
# not contain exactly the same categories.
# Values of val that never occur in train become missing and are encoded as
# all-zero dummy rows.
for col in ["meter", "primaryspaceusage"]:
    levels = pd.api.types.CategoricalDtype(categories=sorted(train[col].dropna().unique()))
    train[col] = train[col].astype(levels)
    val[col] = val[col].astype(levels)
train = pd.get_dummies(train, columns = ["meter","primaryspaceusage"], drop_first=True)
val = pd.get_dummies(val, columns = ["meter","primaryspaceusage"], drop_first=True)

For LightGBM use only a label encoding (this algorithm can handle categorical features by itself):

In [None]:
# Label Encoder for `meter`: integer codes learned on the training rows and
# re-used for validation. Needs the raw `meter` column, i.e. this cell is the
# alternative to the one-hot cell, not a follow-up to it.
le_meter = LabelEncoder()
train["meter"] = le_meter.fit_transform(train["meter"])
val["meter"] = le_meter.transform(val["meter"])

In [None]:
# Label Encoder for `primaryspaceusage`: integer codes learned on the training
# rows and re-used for validation.
le_usage = LabelEncoder()
train["primaryspaceusage"] = le_usage.fit_transform(train["primaryspaceusage"])
val["primaryspaceusage"] = le_usage.transform(val["primaryspaceusage"])

In [None]:
# Split datasets into features (X_*) and target (y_*), then release the
# combined frames.
target = "meter_reading"
X_train, y_train = train.drop(target, axis=1), train[target]
X_val, y_val = val.drop(target, axis=1), val[target]

del train, val

### LightGBM prediction

Default parameters: https://lightgbm.readthedocs.io/en/latest/Parameters.html

In [None]:
# create dataset for lightgbm
# Both datasets declare the same categorical columns and keep their raw data;
# the validation set is built with the training set as its reference.
categorical_cols = ("meter", "primaryspaceusage")
lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=list(categorical_cols),
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(
    X_val,
    label=y_val,
    reference=lgb_train,
    categorical_feature=list(categorical_cols),
    free_raw_data=False,
)

In [None]:
# Parameters (Default): only the objective, the evaluation metric and the
# seed are set explicitly.
params = dict(
    objective="regression",
    metric="rmse",
    random_state=55,
)

In [None]:
# train
# Boosts for at most 10000 rounds with `lgb_eval` as the only validation set.
# NOTE(review): early_stopping_rounds=7500 is three quarters of the round
# budget, so early stopping can only cut the run short in its last quarter -
# confirm this patience is intended.
# NOTE(review): the same 2017 data is used both for early stopping and as the
# reported validation set - confirm this is acceptable for the comparison.
# NOTE(review): recent LightGBM releases presumably expect early stopping and
# logging to be passed as callbacks rather than these keyword arguments -
# verify against the installed version.
print('Starting training...')
gbm = lgb.train(params,
                lgb_train,
                valid_sets = lgb_eval,
                num_boost_round=10000,
                early_stopping_rounds=7500,
                verbose_eval=100)

In [None]:
# predict the validation set with the iteration selected during training
print('Starting predicting...')
best_round = gbm.best_iteration
y_pred = gbm.predict(X_val, num_iteration=best_round)
print('Finish.')

In [None]:
# Distinct predicted values and how many times each one occurs
# (printed in the next cell as a quick look at the prediction distribution).
values, counts = np.unique(y_pred, return_counts=True)

In [None]:
# Show the distinct predictions, then their frequencies, one array per line.
print(values, counts, sep="\n")

In [None]:
# Save LightGBM prediction
# The column has to be named through `columns=`: to_csv(header="LGBM") only
# treats the string as a truthy "write the header" flag, so the file ended up
# with the default column label "0" and the later df_pred[[..., "LGBM"]]
# selection could not find the column. This matches the short-term section.
pd.DataFrame(y_pred, columns=["LGBM"]).to_csv("..\\predictions\\longterm_LGBM.csv", index=False, header=True)

### SciKit-Learn models prediction

In [None]:
# Apply SciKit models
# Each model's validation predictions are written by AllModelsPrediction to
# savepath + name + "_" + <model> + ".csv".
# NOTE(review): with name = "s1" the files are called s1_<model>.csv, which the
# later glob("..\\predictions\\longterm*") does not match, so these predictions
# are not part of the concatenated export - confirm whether the prefix should
# be "longterm".
name = "s1"
savepath = "..\\predictions\\"
AllModelsPrediction(X_train, y_train, X_val, name, savepath)

### Predictions dataset

To export a single dataset with all the predictions.

In [None]:
# Release the modelling frames before loading the prediction files.
del X_train, X_val, y_train, y_val
gc.collect()

In [None]:
# Filepath
# Collect the per-model prediction files of this strategy. The combined export
# written at the end of this section (longterm_pred.csv) shares the prefix, so
# it is excluded - otherwise a re-run would concatenate the previous export
# again and duplicate its columns. Sorted because glob returns files in
# arbitrary order.
files = sorted(
    f for f in glob("..\\predictions\\longterm*")
    if not f.endswith("longterm_pred.csv")
)

In [None]:
files

In [None]:
# Concatenates all predictions
# ConcatAllModels joins the files column-wise on their default row index, so
# every file is expected to hold the validation rows in the same order.
df_pred = ConcatAllModels(files)

In [None]:
# Keep the row identifiers, the real reading and the LightGBM prediction;
# any other column is dropped from the export.
export_cols = ["timestamp", "building_id", "meter", "meter_reading", "LGBM"]
df_pred = df_pred[export_cols]

In [None]:
df_pred.head()

In [None]:
# Saves files
# Writes the combined real/predicted table without the row index.
df_pred.to_csv("..\\predictions\\longterm_pred.csv", index=False)

## Short term prediction: weekly prediction

**train:**<br>
**validation:**

In [None]:
# Load dataset
# The file name was missing here (path_meter + ".csv" points at a file that is
# literally called ".csv"). dev_merged.csv is the merged dataset written
# earlier in this notebook and the one the long-term section loads.
data = pd.read_csv(path_meter + "dev_merged.csv")
# Reduce memory
data = reduce_mem_usage(data)

In [None]:
# The csv round-trip stores timestamps as text; parse them back to datetimes.
stamp_format = '%Y-%m-%d %H:%M:%S'
data["timestamp"] = pd.to_datetime(data["timestamp"], format=stamp_format)

In [None]:
# Strategy split
# TODO: the split boundaries were never filled in - the original comparisons
# had no right-hand side, which is a SyntaxError. Set the three timestamps
# below (e.g. "2017-01-01 00:00:00") before running this cell.
train_end = None   # train uses rows strictly before this timestamp
val_start = None   # validation starts at this timestamp (inclusive)
val_end = None     # validation ends at this timestamp (exclusive)
if None in (train_end, val_start, val_end):
    raise ValueError(
        "Short-term split boundaries are not set: "
        "define train_end, val_start and val_end."
    )
train = data[data["timestamp"] < train_end]
val = data[(data["timestamp"] >= val_start) & (data["timestamp"] < val_end)]
del(data)

#### Real values

In [None]:
# Save the real validation readings next to the predictions. This runs before
# timestamp/building_id are moved into the index, so they are still columns.
real_cols = ["timestamp", "building_id", "meter", "meter_reading"]
val[real_cols].to_csv("..\\predictions\\shortterm_real.csv", index=False)

#### Encoding and Data/Target split

In [None]:
# Move the row identifiers into the index so they are not used as features.
index_cols = ["timestamp", "building_id"]
train = train.set_index(index_cols)
val = val.set_index(index_cols)

For linear models and decision tree based models (SciKit) use OneHotEncoding:

In [None]:
# Get Dummies
# Encode train and val against the SAME category list (the sorted categories
# seen in training). With a short validation window it is likely that val
# lacks some categories, and encoding each frame independently would then give
# different columns and a different dropped "first" level.
# Values of val that never occur in train become missing and are encoded as
# all-zero dummy rows.
for col in ["meter", "primaryspaceusage"]:
    levels = pd.api.types.CategoricalDtype(categories=sorted(train[col].dropna().unique()))
    train[col] = train[col].astype(levels)
    val[col] = val[col].astype(levels)
train = pd.get_dummies(train, columns = ["meter","primaryspaceusage"], drop_first=True)
val = pd.get_dummies(val, columns = ["meter","primaryspaceusage"], drop_first=True)

For LightGBM use only a label encoding (this algorithm can handle categorical features by itself):

In [None]:
# Label Encoder for `meter`: integer codes learned on the training rows and
# re-used for validation. Needs the raw `meter` column, i.e. this cell is the
# alternative to the one-hot cell, not a follow-up to it.
le_meter = LabelEncoder()
train["meter"] = le_meter.fit_transform(train["meter"])
val["meter"] = le_meter.transform(val["meter"])

In [None]:
# Label Encoder for `primaryspaceusage`: integer codes learned on the training
# rows and re-used for validation.
le_usage = LabelEncoder()
train["primaryspaceusage"] = le_usage.fit_transform(train["primaryspaceusage"])
val["primaryspaceusage"] = le_usage.transform(val["primaryspaceusage"])

In [None]:
# Split datasets into features (X_*) and target (y_*), then release the
# combined frames.
target = "meter_reading"
X_train, y_train = train.drop(target, axis=1), train[target]
X_val, y_val = val.drop(target, axis=1), val[target]

del train, val

#### LightGBM prediction

Default parameters: https://lightgbm.readthedocs.io/en/latest/Parameters.html

In [None]:
# create dataset for lightgbm
# Both datasets declare the same categorical columns and keep their raw data;
# the validation set is built with the training set as its reference.
categorical_cols = ("meter", "primaryspaceusage")
lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=list(categorical_cols),
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(
    X_val,
    label=y_val,
    reference=lgb_train,
    categorical_feature=list(categorical_cols),
    free_raw_data=False,
)

In [None]:
# Parameters (Default): only the objective, the evaluation metric and the
# seed are set explicitly.
params = dict(
    objective="regression",
    metric="rmse",
    random_state=55,
)

In [None]:
# train
# Uses `lgb_eval` as the only validation set and stops after 100 rounds
# without improvement on it.
# NOTE(review): num_boost_round is not set here (the long-term section passes
# 10000). If the library default is no larger than the 100-round patience,
# early stopping can never trigger - verify against the LightGBM docs.
print('Starting training...')
gbm = lgb.train(params,
                lgb_train,
                valid_sets = lgb_eval,
                early_stopping_rounds=100)

In [None]:
# predict the validation set with the iteration selected during training
print('Starting predicting...')
best_round = gbm.best_iteration
y_pred = gbm.predict(X_val, num_iteration=best_round)
print('Finish.')

In [None]:
# Save LightGBM prediction
# The column is named "LGBM" through `columns=` so the export cell further
# down can select it by that name; the row index is not written.
pd.DataFrame(y_pred, columns=['LGBM']).to_csv("..\\predictions\\shortterm_LGBM.csv", index=False, header=True)

### SciKit-Learn models prediction

In [None]:
# Apply SciKit models
# Each model's validation predictions are written by AllModelsPrediction to
# savepath + name + "_" + <model> + ".csv".
# NOTE(review): with name = "s2a" the files are called s2a_<model>.csv, which
# the later glob("..\\predictions\\shortterm*") does not match, so these
# predictions are not part of the concatenated export - confirm whether the
# prefix should be "shortterm".
name = "s2a"
savepath = "..\\predictions\\"
AllModelsPrediction(X_train, y_train, X_val, name, savepath)

### Predictions dataset

To export a single dataset with all the predictions.

In [None]:
# Release the modelling frames and the LightGBM predictions before loading
# the prediction files.
del X_train, X_val, y_train, y_val, y_pred
gc.collect()

In [None]:
# Filepath
# Collect the per-model prediction files of this strategy. A combined export
# named shortterm_pred.csv would share the prefix, so it is excluded to keep a
# re-run from concatenating a previous export again. Sorted because glob
# returns files in arbitrary order.
files = sorted(
    f for f in glob("..\\predictions\\shortterm*")
    if not f.endswith("shortterm_pred.csv")
)

In [None]:
files

In [None]:
# Concatenates all predictions
# ConcatAllModels joins the files column-wise on their default row index, so
# every file is expected to hold the validation rows in the same order.
df_pred = ConcatAllModels(files)

In [None]:
# Keep the row identifiers, the real reading and the LightGBM prediction;
# any other column is dropped from the export.
export_cols = ["timestamp", "building_id", "meter", "meter_reading", "LGBM"]
df_pred = df_pred[export_cols]

In [None]:
df_pred.head()

In [None]:
# Saves files
# The output path was left empty, so nothing could be written. The name below
# follows the long-term section's "<strategy>_pred.csv" convention.
df_pred.to_csv("..\\predictions\\shortterm_pred.csv", index=False)