# Master Sci-kit Learn Models for Building Energy Modelling

- Clayton Miller - clayton@nus.edu.sg
- Kairat Talentbekov

This notebook is main model training/testing notebook based on the prototypes

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

## Loading to the main temporal and meta data files

In [2]:
meta = pd.read_csv("../input/meta_open.csv", index_col='uid', parse_dates=["datastart","dataend"], dayfirst=True)
temporal = pd.read_csv("../input/temp_open_utc_complete.csv", index_col='timestamp', parse_dates=True).tz_localize('utc')

In [3]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 507 entries, Office_Abbey to UnivLab_Tracy
Data columns (total 19 columns):
dataend                   507 non-null datetime64[ns]
datastart                 507 non-null datetime64[ns]
energystarscore           26 non-null float64
heatingtype               124 non-null object
industry                  507 non-null object
mainheatingtype           122 non-null object
numberoffloors            124 non-null float64
occupants                 105 non-null float64
primaryspaceusage         507 non-null object
rating                    131 non-null object
sqft                      507 non-null float64
sqm                       507 non-null float64
subindustry               507 non-null object
timezone                  507 non-null object
yearbuilt                 313 non-null object
nickname                  507 non-null object
primaryspaceuse_abbrev    507 non-null object
newweatherfilename        507 non-null object
annualschedule            482 n

## Regression model library from Scikit-Learn libary

In [457]:
# All models types
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from  sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TheilSenRegressor
    
# Make array of models. Each model is an array of two elements.
# First element is a model-name, second is a model itself
models = [['RandomForestRegressor', RandomForestRegressor(n_estimators = 1000, random_state = 42)],
['AdaBoostRegressor', AdaBoostRegressor(n_estimators = 1000, random_state = 42)],
['BaggingRegressor', BaggingRegressor(n_estimators = 1000, random_state = 42)],
['DecisionTreeRegressor', DecisionTreeRegressor(random_state = 42)],
['DummyRegressor', DummyRegressor()],
['ExtraTreeRegressor', ExtraTreeRegressor(random_state = 42)],
['ExtraTreesRegressor', ExtraTreesRegressor(n_estimators = 1000, random_state = 42)],
['GaussianProcessRegressor', GaussianProcessRegressor(random_state = 42)],
['GradientBoostingRegressor', GradientBoostingRegressor(n_estimators = 1000, random_state = 42)],
['HuberRegressor', HuberRegressor()],
['KNeighborsRegressor', KNeighborsRegressor()],
['MLPRegressor', MLPRegressor(random_state = 42)],
['PassiveAggressiveRegressor', PassiveAggressiveRegressor(random_state = 42)],
['RANSACRegressor', RANSACRegressor(random_state = 42)],
['SGDRegressor', SGDRegressor(random_state = 42)],
['TheilSenRegressor', TheilSenRegressor(random_state = 42)]
]

### Functions for loading and processing data

In [458]:
def load_energy_data(meta, singlebuilding):
    # Get Data
    single_timezone = meta.T[singlebuilding].timezone
    single_start = meta.T[singlebuilding].datastart
    single_end = meta.T[singlebuilding].dataend
    return pd.DataFrame(temporal[singlebuilding].tz_convert(single_timezone).truncate(before=single_start,after=single_end))

In [459]:
def load_weather_data(meta, singlebuilding, single_building_data, weatherpoint):
    # Get weather file
    single_timezone = meta.T[singlebuilding].timezone
    weatherfilename = meta.T[singlebuilding].newweatherfilename
    #print("Weatherfile: "+weatherfilename)
    weather = pd.read_csv(os.path.join("../input/",weatherfilename),index_col='timestamp', parse_dates=True, na_values='-9999')
    weather = weather.tz_localize(single_timezone, ambiguous = 'infer')
    point_data = pd.DataFrame(weather[[col for col in weather.columns if weatherpoint in col]]).resample("H").mean()
    return point_data.reindex(pd.DatetimeIndex(start=point_data.index[0], periods=len(single_building_data), freq="H")).fillna(method='ffill').fillna(method='bfill')

In [460]:
def load_seasonal_schedule(meta, singlebuilding, single_building_data):
    schedulefilename = meta.T[singlebuilding].annualschedule
    single_timezone = meta.T[singlebuilding].timezone
    schedule = pd.read_csv(os.path.join("../input/",schedulefilename), header=None, parse_dates=True, index_col=0)
    schedule = schedule.tz_localize(single_timezone, ambiguous = 'infer')
    schedule.columns = ["seasonal"]
    return schedule.reindex(pd.DatetimeIndex(start=schedule.index[0], periods=len(single_building_data), freq="H")).fillna(method='ffill').fillna(method='bfill')
    
    

# Demo of a single building

In [461]:
meta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 507 entries, Office_Abbey to UnivLab_Tracy
Data columns (total 19 columns):
dataend                   507 non-null datetime64[ns]
datastart                 507 non-null datetime64[ns]
energystarscore           26 non-null float64
heatingtype               124 non-null object
industry                  507 non-null object
mainheatingtype           122 non-null object
numberoffloors            124 non-null float64
occupants                 105 non-null float64
primaryspaceusage         507 non-null object
rating                    131 non-null object
sqft                      507 non-null float64
sqm                       507 non-null float64
subindustry               507 non-null object
timezone                  507 non-null object
yearbuilt                 313 non-null object
nickname                  507 non-null object
primaryspaceuse_abbrev    507 non-null object
newweatherfilename        507 non-null object
annualschedule            482 n

In [462]:
singlebuilding = "Office_Benthe"

In [463]:
single_building_data = load_energy_data(meta, singlebuilding)

In [464]:
single_building_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8784 entries, 2012-01-01 00:00:00-08:00 to 2012-12-31 23:00:00-08:00
Data columns (total 1 columns):
Office_Benthe    8784 non-null float64
dtypes: float64(1)
memory usage: 137.2 KB


In [465]:
outdoor_temp = load_weather_data(meta, singlebuilding, single_building_data, "Temperature")
outdoor_humidity = load_weather_data(meta, singlebuilding, single_building_data, "Humidity")

In [466]:
outdoor_humidity.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8784 entries, 2012-01-01 00:00:00-08:00 to 2012-12-31 23:00:00-08:00
Freq: H
Data columns (total 1 columns):
Humidity    8784 non-null float64
dtypes: float64(1)
memory usage: 137.2 KB


In [467]:
schedule = load_seasonal_schedule(meta, singlebuilding, single_building_data)

In [468]:
schedule.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8784 entries, 2012-01-01 00:00:00-08:00 to 2012-12-31 23:00:00-08:00
Freq: H
Data columns (total 1 columns):
seasonal    8784 non-null object
dtypes: object(1)
memory usage: 137.2+ KB


In [469]:
months = np.array([single_building_data.index.month.unique()])[0]
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

In [470]:
tscv

TimeSeriesSplit(max_train_size=None, n_splits=3)

In [471]:
for train_index, test_index in tscv.split(months):
    month_train, month_test = months[train_index], months[test_index]
    print(month_train, month_test)

[1 2 3] [4 5 6]
[1 2 3 4 5 6] [7 8 9]
[1 2 3 4 5 6 7 8 9] [10 11 12]


In [472]:
np.concatenate([months[0:3], months[4:7], months[8:11]])

array([ 1,  2,  3,  5,  6,  7,  9, 10, 11])

In [473]:
np.array([months[4], months[7], months[11]])

array([ 5,  8, 12])

In [474]:
def create_train_test_indices(months):
    train_test_lists = []
    
    #Get time-series split version
    n_splits = 3
    tscv = TimeSeriesSplit(n_splits=n_splits)
    for train_index, test_index in tscv.split(months):
        month_train, month_test = months[train_index], months[test_index]
        train_test_lists.append([month_train, month_test])
        
    #Add the 'every-fourth-month' version
    train_test_lists.append([np.concatenate([months[0:3], months[4:7], 
                                            months[8:11]]), np.array([months[4], months[7], months[11]])])
    
    return train_test_lists

In [475]:
train_test_lists = create_train_test_indices(months)

In [476]:
for train_index, test_index in train_test_lists:  
    print(train_index, test_index)

[1 2 3] [4 5 6]
[1 2 3 4 5 6] [7 8 9]
[1 2 3 4 5 6 7 8 9] [10 11 12]
[ 1  2  3  5  6  7  9 10 11] [ 5  8 12]


In [477]:
months = train_index

In [478]:

# data = single_building_data[single_building_data.index.month.isin(months)]


# features = pd.concat((pd.get_dummies(data.index.hour),
#                       pd.get_dummies(data.index.dayofweek),
#                       pd.Series(outdoor_temp[outdoor_temp.index.month.isin(months)].TemperatureC.values),
#                       pd.Series(outdoor_humidity[outdoor_humidity.index.month.isin(months)].Humidity.values),
#                       pd.get_dummies(schedule[schedule.index.month.isin(months)][1].values)), axis=1)
# #features = features.fillna(method='ffill').fillna(method='bfill')
# #features = np.array(features)

# labels = data[singlebuilding]#.values

In [479]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6576 entries, 2012-01-01 00:00:00-08:00 to 2012-11-30 23:00:00-08:00
Data columns (total 7 columns):
energy              6576 non-null float64
TemperatureC        6576 non-null float64
Humidity            6576 non-null float64
seasonal_Break      6576 non-null uint8
seasonal_Holiday    6576 non-null uint8
seasonal_Regular    6576 non-null uint8
seasonal_Summer     6576 non-null uint8
dtypes: float64(3), uint8(4)
memory usage: 231.2 KB


In [480]:
#labels

In [481]:
data = single_building_data[single_building_data.index.month.isin(months)][singlebuilding]

In [482]:
data.index.hour

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
            14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
           dtype='int64', name='timestamp', length=6576)

In [483]:
data = pd.merge(pd.DataFrame({"energy":data}), outdoor_temp, right_index=True, left_index=True)
data = pd.merge(data, outdoor_humidity, right_index=True, left_index=True)
data = pd.merge(data, pd.get_dummies(schedule), right_index=True, left_index=True)
# data = pd.merge(data, pd.get_dummies(data.index.hour), right_index=True, left_index=True)

In [484]:
data.columns

Index(['energy', 'TemperatureC', 'Humidity', 'seasonal_Break',
       'seasonal_Holiday', 'seasonal_Regular', 'seasonal_Summer'],
      dtype='object')

In [485]:
def get_features_and_labels(single_building_data, singlebuilding, outdoor_temp, outdoor_humidity, schedule, months):
        data = single_building_data[single_building_data.index.month.isin(months)][singlebuilding]
        data = pd.merge(pd.DataFrame({"energy":data}), outdoor_temp, right_index=True, left_index=True)
        data = pd.merge(data, outdoor_humidity, right_index=True, left_index=True)
        data = pd.merge(data, pd.get_dummies(schedule), right_index=True, left_index=True)
        
        features = pd.concat((pd.get_dummies(data.index.hour),
                              pd.get_dummies(data.index.dayofweek),
                              data.drop(["energy"], axis=1).reset_index(drop=True)),axis=1)
        
        features = features.fillna(method='ffill').fillna(method='bfill')
        
        
        labels = data["energy"].values
        
        features = np.array(features)
        
        return features, labels
    
    

In [486]:
train_features, train_labels = get_features_and_labels(single_building_data, singlebuilding, outdoor_temp, outdoor_humidity, schedule, train_index)
test_features, test_labels = get_features_and_labels(single_building_data, singlebuilding, outdoor_temp, outdoor_humidity, schedule, test_index)

In [487]:
#train_features.info()

In [488]:
len(train_labels)

6576

In [489]:
testmodel = DummyRegressor()
testmodel.fit(train_features, train_labels)

DummyRegressor(constant=None, quantile=None, strategy='mean')

In [490]:
predictions = testmodel.predict(test_features)

In [491]:
predictions

array([202.26538448, 202.26538448, 202.26538448, ..., 202.26538448,
       202.26538448, 202.26538448])

In [492]:
#list(buildingnames)

## Loop through

In [493]:
def createMetrics(modelName, model, buildingnames):
    print('\n\n' + modelName + '\n_____________')
    # buidingindex
    buildingindex = 0
    
    for singlebuilding in buildingnames:
        buildingindex+=1
        print("Modelling: " + singlebuilding)
        
        # Get energy data
        single_building_data = load_energy_data(meta, singlebuilding)
     
        # Get weather and schedules
        outdoor_temp = load_weather_data(meta, singlebuilding, single_building_data, "Temperature")
        outdoor_humidity = load_weather_data(meta, singlebuilding, single_building_data, "Humidity")
        schedule = load_seasonal_schedule(meta, singlebuilding, single_building_data)
    
        # Test/Train cycle
        months = np.array([single_building_data.index.month.unique()])[0]
        train_test_lists = create_train_test_indices(months)

        
        index = 0
        for train_index, test_index in train_test_lists:    
                        
            # Get Training Data
            train_features, train_labels = get_features_and_labels(single_building_data, singlebuilding, outdoor_temp, outdoor_humidity, schedule, train_index)
            
            # Create test data array
            test_features, test_labels = get_features_and_labels(single_building_data, singlebuilding, outdoor_temp, outdoor_humidity, schedule, test_index)

            # Train the model on training data
            mainmodel = model
            mainmodel.fit(train_features, train_labels);

            # Use the forest's predict method on the test data
            predictions = mainmodel.predict(test_features)


            # Calculate the absolute errors
            errors = abs(predictions - test_labels)
            # Calculate mean absolute percentage error (MAPE) and add to list
            MAPE = 100 * np.mean((errors / test_labels))
            NMBE = 100 * (sum(test_labels - predictions) / (pd.Series(test_labels).count() * np.mean(test_labels)))
            CVRSME = 100 * ((sum((test_labels - predictions)**2) / (pd.Series(test_labels).count()-1))**(0.5)) / np.mean(test_labels)
            RSQUARED = r2_score(test_labels, predictions)

            index+=1
            if(buildingindex == 1):
                temporary = pd.DataFrame(columns=["building", "MAPE", "NMBE", "CVRSME", "RSQUARED"])
                temporary.to_csv('../results/' + modelName + '_metrics_cross_validation_' + str(index) + '.csv', index=False)
            # Read dataframe with particular step (cross validation) 
            metrics_prev = pd.read_csv('../results/' + modelName + '_metrics_cross_validation_' + str(index) + '.csv')
            df = pd.DataFrame([[singlebuilding, MAPE, NMBE, CVRSME, RSQUARED]],columns=['building','MAPE','NMBE','CVRSME','RSQUARED'])
            # Append new row
            metrics = pd.concat([df, metrics_prev])
            metrics.to_csv('../results/' + modelName + '_metrics_cross_validation_' + str(index) + '.csv', index=False)


In [494]:
buildingnames = meta.dropna(subset=['annualschedule']).index

In [495]:
buildingnames

Index(['Office_Abbey', 'Office_Abigail', 'Office_Al', 'Office_Alannah',
       'Office_Aliyah', 'Office_Allyson', 'Office_Alyson', 'Office_Amelia',
       'Office_Amelie', 'Office_Anastasia',
       ...
       'UnivLab_Preston', 'UnivLab_Priscilla', 'UnivLab_Santiago',
       'UnivLab_Susan', 'UnivLab_Suzette', 'UnivLab_Tami', 'UnivLab_Taylor',
       'UnivLab_Terrie', 'UnivLab_Tracie', 'UnivLab_Tracy'],
      dtype='object', name='uid', length=482)

In [496]:
MAPE_data = {}
RSQUARED_data = {}
NMBE_data = {}
CVRSME_data = {}
for elem in models:
    # modelName = elem[0], model = elem[1]
    createMetrics(elem[0], elem[1], buildingnames)
    



RandomForestRegressor
_____________
Modelling: Office_Abbey
Modelling: Office_Abigail
Modelling: Office_Al
Modelling: Office_Alannah
Modelling: Office_Aliyah
Modelling: Office_Allyson
Modelling: Office_Alyson
Modelling: Office_Amelia
Modelling: Office_Amelie
Modelling: Office_Anastasia
Modelling: Office_Andrea
Modelling: Office_Angelica
Modelling: Office_Angelina
Modelling: Office_Angelo
Modelling: Office_Annika
Modelling: Office_Ashanti
Modelling: Office_Asher
Modelling: Office_Aubrey
Modelling: Office_Autumn
Modelling: Office_Ava
Modelling: Office_Ayden
Modelling: Office_Ayesha
Modelling: Office_Benjamin
Modelling: Office_Benthe
Modelling: Office_Bianca
Modelling: Office_Bobbi
Modelling: Office_Brian
Modelling: Office_Bryon
Modelling: Office_Caleb
Modelling: Office_Cameron
Modelling: Office_Carissa
Modelling: Office_Carolina
Modelling: Office_Catherine
Modelling: Office_Cecelia
Modelling: Office_Charles
Modelling: Office_Clarissa
Modelling: Office_Clifton
Modelling: Office_Clinton


Modelling: UnivClass_Stephanie
Modelling: UnivClass_Stephen
Modelling: UnivClass_Stuart
Modelling: UnivClass_Sylvia
Modelling: UnivClass_Tammy
Modelling: UnivClass_Tamra
Modelling: UnivClass_Teri
Modelling: UnivClass_Therese
Modelling: UnivDorm_Adan
Modelling: UnivDorm_Adriana
Modelling: UnivDorm_Ahmad
Modelling: UnivDorm_Alex
Modelling: UnivDorm_Alka
Modelling: UnivDorm_Alonzo
Modelling: UnivDorm_Alphonso
Modelling: UnivDorm_Alyshialynn
Modelling: UnivDorm_Alyssa
Modelling: UnivDorm_Antonio
Modelling: UnivDorm_April
Modelling: UnivDorm_Ashleigh
Modelling: UnivDorm_Avery
Modelling: UnivDorm_Camila
Modelling: UnivDorm_Candace
Modelling: UnivDorm_Cara
Modelling: UnivDorm_Carey
Modelling: UnivDorm_Carla
Modelling: UnivDorm_Carter
Modelling: UnivDorm_Casey
Modelling: UnivDorm_Cathal
Modelling: UnivDorm_Cathalina
Modelling: UnivDorm_Cecilia
Modelling: UnivDorm_Celeste
Modelling: UnivDorm_Chelsey
Modelling: UnivDorm_Cheri
Modelling: UnivDorm_Chester
Modelling: UnivDorm_Cheyenne
Modelling: Un

Modelling: Office_Precious
Modelling: Office_Scottie
Modelling: Office_Shari
Modelling: Office_Shawnette
Modelling: Office_Shelly
Modelling: Office_Sinead
Modelling: Office_Skyler
Modelling: Office_Stella
Modelling: Office_Terrell
Modelling: Office_Tod
Modelling: Office_Travis
Modelling: PrimClass_Angel
Modelling: PrimClass_Angela
Modelling: PrimClass_Jacob
Modelling: PrimClass_Jacqueline
Modelling: PrimClass_Jacquelyn
Modelling: PrimClass_Jaden
Modelling: PrimClass_Jaiden
Modelling: PrimClass_Jake
Modelling: PrimClass_Jamal
Modelling: PrimClass_Jamie
Modelling: PrimClass_Jane
Modelling: PrimClass_Janelle
Modelling: PrimClass_Janet
Modelling: PrimClass_Janice
Modelling: PrimClass_Janie
Modelling: PrimClass_Janis
Modelling: PrimClass_Janiya
Modelling: PrimClass_Jaqueline
Modelling: PrimClass_Jarrett
Modelling: PrimClass_Jasmine
Modelling: PrimClass_Javier
Modelling: PrimClass_Jaxson
Modelling: PrimClass_Jayda
Modelling: PrimClass_Jayla
Modelling: PrimClass_Jaylin
Modelling: PrimClass_Ja

Modelling: UnivLab_Crystal
Modelling: UnivLab_Dianna
Modelling: UnivLab_Lauren
Modelling: UnivLab_Lea
Modelling: UnivLab_Lee
Modelling: UnivLab_Lester
Modelling: UnivLab_Levi
Modelling: UnivLab_Lilly
Modelling: UnivLab_Louie
Modelling: UnivLab_Lyle
Modelling: UnivLab_Mack
Modelling: UnivLab_Madelyn
Modelling: UnivLab_Margret
Modelling: UnivLab_Mariana
Modelling: UnivLab_Marie
Modelling: UnivLab_Mario
Modelling: UnivLab_Marshall
Modelling: UnivLab_Miles
Modelling: UnivLab_Neil
Modelling: UnivLab_Paris
Modelling: UnivLab_Parker
Modelling: UnivLab_Patrick
Modelling: UnivLab_Patsy
Modelling: UnivLab_Paul
Modelling: UnivLab_Peggy
Modelling: UnivLab_Peyton
Modelling: UnivLab_Phil
Modelling: UnivLab_Preston
Modelling: UnivLab_Priscilla
Modelling: UnivLab_Santiago
Modelling: UnivLab_Susan
Modelling: UnivLab_Suzette
Modelling: UnivLab_Tami
Modelling: UnivLab_Taylor
Modelling: UnivLab_Terrie
Modelling: UnivLab_Tracie
Modelling: UnivLab_Tracy


BaggingRegressor
_____________
Modelling: Office_Abb

KeyboardInterrupt: 