In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score

In [2]:
# Building metadata, one row per building keyed by its uid; the data start/end
# columns are parsed as day-first dates.
meta = pd.read_csv(
    "../input/meta_open.csv",
    index_col='uid',
    parse_dates=["datastart", "dataend"],
    dayfirst=True,
)

# Time-indexed readings, one column per building. The parsed timestamps are
# localised to UTC in a separate step.
temporal = pd.read_csv(
    "../input/temp_open_utc_complete.csv",
    index_col='timestamp',
    parse_dates=True,
)
temporal = temporal.tz_localize('utc')

In [3]:
# Keep only the columns whose name contains "Office".
buildingnames = temporal.columns[["Office" in column for column in temporal.columns]]

In [4]:
# Show the selected column names (rendered as this cell's output).
buildingnames

Index(['Office_Cristina', 'Office_Jesus', 'Office_Jett', 'Office_Jerry',
       'Office_Lesa', 'Office_Jackie', 'Office_Marla', 'Office_Maryann',
       'Office_Myron', 'Office_Conrad',
       ...
       'Office_Ellie', 'Office_Erik', 'Office_Evelyn', 'Office_Emer',
       'Office_Elena', 'Office_Emerald', 'Office_Ellis', 'Office_Elliot',
       'Office_Eddie', 'Office_Georgia'],
      dtype='object', length=156)

In [5]:
# Import all models we are using
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from  sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TheilSenRegressor
    
# Build the list of models to evaluate. Every entry is a two-element list:
# the first element is the model name (the estimator's class name), the
# second is the estimator instance itself.
models = [[type(estimator).__name__, estimator] for estimator in (
    AdaBoostRegressor(n_estimators=1000, random_state=42),
    BaggingRegressor(n_estimators=1000, random_state=42),
    DecisionTreeRegressor(random_state=42),
    DummyRegressor(),
    ExtraTreeRegressor(random_state=42),
    ExtraTreesRegressor(n_estimators=1000, random_state=42),
    GaussianProcessRegressor(random_state=42),
    GradientBoostingRegressor(n_estimators=1000, random_state=42),
    HuberRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(random_state=42),
    PassiveAggressiveRegressor(random_state=42),
    RANSACRegressor(random_state=42),
    SGDRegressor(random_state=42),
    TheilSenRegressor(random_state=42),
)]

  from numpy.core.umath_tests import inner1d


In [6]:
# Metrics function

# Column order of the per-model metrics CSV.
_METRIC_COLUMNS = ["MAPE", "NMBE", "CVRSME", "RSQUARED"]

# Per-model metric store: model name -> metric name -> {building: value}.
# The module-level *_data dicts are keyed by building only and are shared by
# every model, so on their own they cannot tell which model produced a value.
_metrics_by_model = {}


def analysis(modelName, model):
    """Fit ``model`` on the current building, score it and save the metrics.

    Reads the module-level ``train_features``, ``train_labels``,
    ``test_features``, ``test_labels`` and ``singlebuilding`` that the calling
    loop sets before each call.

    Side effects:
      * prints MAPE, NMBE, CVRSME and R SQUARED for this fit;
      * stores the four values under ``singlebuilding`` in the module-level
        ``MAPE_data``, ``NMBE_data``, ``CVRSME_data`` and ``RSQUARED_data``
        dicts (for each building these hold the most recently evaluated model);
      * rewrites ``<modelName>_metrics.csv`` with one row per building
        evaluated so far for this model name.

    Parameters
    ----------
    modelName : str
        Name used as the key of the per-model store and as the CSV file prefix.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # Train the model on training data
    model.fit(train_features, train_labels)
    # Predict on the held-out test data
    predictions = model.predict(test_features)

    # Number of non-NaN test observations
    n_obs = pd.Series(test_labels).count()
    mean_label = np.mean(test_labels)
    residuals = test_labels - predictions

    # Mean absolute percentage error; test labels equal to zero yield inf/nan here.
    MAPE = 100 * np.mean(np.abs(residuals) / test_labels)
    # Normalised mean bias error
    NMBE = 100 * (np.sum(residuals) / (n_obs * mean_label))
    # Coefficient of variation of the root-mean-square error (n - 1 denominator)
    CVRSME = 100 * ((np.sum(residuals ** 2) / (n_obs - 1)) ** 0.5) / mean_label
    RSQUARED = r2_score(test_labels, predictions)

    print("MAPE: "+str(MAPE))
    print("NMBE: "+str(NMBE))
    print("CVRSME: "+str(CVRSME))
    print("R SQUARED: "+str(RSQUARED))

    # Keep updating the shared dicts so existing code that reads them still works.
    MAPE_data[singlebuilding] = MAPE
    NMBE_data[singlebuilding] = NMBE
    CVRSME_data[singlebuilding] = CVRSME
    RSQUARED_data[singlebuilding] = RSQUARED

    # Record the values under this model's own name, so the CSV written below
    # only ever contains numbers produced by this model. Building the CSV from
    # the shared dicts would mix in results that other models stored for
    # earlier buildings.
    store = _metrics_by_model.setdefault(modelName, {name: {} for name in _METRIC_COLUMNS})
    for name, value in zip(_METRIC_COLUMNS, (MAPE, NMBE, CVRSME, RSQUARED)):
        store[name][singlebuilding] = value

    metrics = pd.DataFrame([store[name] for name in _METRIC_COLUMNS]).T
    metrics.columns = list(_METRIC_COLUMNS)
    metrics.to_csv(modelName + '_metrics.csv')

In [7]:
# Shared metric dicts written by analysis(); keyed by building name.
MAPE_data = {}
RSQUARED_data = {}
NMBE_data = {}
CVRSME_data = {}

# Month-based split: April, August and December are held out for testing and
# the other nine months are used for training.
# These must be integers: DatetimeIndex.month yields integers, and current
# pandas never matches an integer against a string in isin().
TRAIN_MONTHS = [1, 2, 3, 5, 6, 7, 9, 10, 11]
TEST_MONTHS = [4, 8, 12]


def _build_features(readings, outdoor_temp, months):
    """Return the feature array for ``readings``.

    Columns are: one-hot hour of day, one-hot day of week, and the
    ``TemperatureC`` values of ``outdoor_temp`` for the given ``months``.
    The three parts are joined positionally (all use a fresh 0..n-1 index).
    """
    temperature = outdoor_temp[outdoor_temp.index.month.isin(months)].TemperatureC.values
    return np.array(pd.concat([pd.get_dummies(readings.index.hour),
                               pd.get_dummies(readings.index.dayofweek),
                               pd.Series(temperature)], axis=1))


for singlebuilding in buildingnames:
    print("Modelling: " + singlebuilding)
    # Get Data
    # One row lookup instead of transposing the whole metadata frame per field.
    building_meta = meta.loc[singlebuilding]
    single_timezone = building_meta.timezone
    single_start = building_meta.datastart
    single_end = building_meta.dataend
    # NOTE(review): single_start/single_end come from parse_dates and are
    # presumably tz-naive while the index is tz-aware - confirm the installed
    # pandas accepts these truncate bounds.
    local_readings = temporal[singlebuilding].tz_convert(single_timezone)
    single_building_data = pd.DataFrame(local_readings.truncate(before=single_start, after=single_end))

    # Split into Training and Testing
    reading_month = single_building_data.index.month
    trainingdata = single_building_data[reading_month.isin(TRAIN_MONTHS)]
    testdata = single_building_data[reading_month.isin(TEST_MONTHS)]

    # Get weather file
    weatherfilename = building_meta.newweatherfilename
    print("Weatherfile: "+weatherfilename)
    weather = pd.read_csv(os.path.join("../input/", weatherfilename), index_col='timestamp', parse_dates=True, na_values='-9999')
    weather = weather.tz_localize(single_timezone, ambiguous='infer')
    temperature_columns = [col for col in weather.columns if 'Temperature' in col]
    outdoor_temp = weather[temperature_columns].resample("H").mean()
    # Hourly grid starting at the first weather timestamp, as long as the
    # building series; gaps are filled forward, then backward.
    # pd.date_range replaces the removed DatetimeIndex(start=, periods=, freq=)
    # constructor form.
    # NOTE(review): temperatures are matched to readings by position, which
    # assumes the weather file starts at the same hour as the truncated
    # building data - confirm.
    hourly_index = pd.date_range(start=outdoor_temp.index[0], periods=len(single_building_data), freq="H")
    outdoor_temp = outdoor_temp.reindex(hourly_index).ffill().bfill()

    # Create training data array
    train_features = _build_features(trainingdata, outdoor_temp, TRAIN_MONTHS)
    train_labels = np.array(trainingdata[singlebuilding].values)

    # Create test data array
    test_features = _build_features(testdata, outdoor_temp, TEST_MONTHS)
    test_labels = np.array(testdata[singlebuilding].values)

    # Fit and score every model on this building; analysis() reads the
    # train/test arrays and singlebuilding from module scope.
    for modelName, model in models:
        analysis(modelName, model)


Modelling: Office_Cristina
Weatherfile: weather2.csv
MAPE: 27.086057495078965
NMBE: 1.0166219886345862
CVRSME: 28.589986502921278
R SQUARED: 0.18218026965653822
MAPE: 17.858183102422007
NMBE: 0.4656609718135961
CVRSME: 21.38615456575262
R SQUARED: 0.5423904430701022
MAPE: 21.384634485764074
NMBE: 0.517152228279596
CVRSME: 25.316739821903553
R SQUARED: 0.3587236256494244
MAPE: 31.48652791375246
NMBE: -1.4879135920651274
CVRSME: 31.649425974673346
R SQUARED: -0.0022160616661179855
MAPE: 21.199458106454106
NMBE: 0.45601705302225254
CVRSME: 25.044352416553643
R SQUARED: 0.37244860964320503
MAPE: 19.391880912638175
NMBE: 0.31110421090563156
CVRSME: 23.17412809017922
R SQUARED: 0.4626756902216318
MAPE: 491.76401191704684
NMBE: -34.16205833864936
CVRSME: 1439.5829752350294
R SQUARED: -2072.4954524059704
MAPE: 16.23432854530993
NMBE: 0.6764609008623663
CVRSME: 19.45383065676439
R SQUARED: 0.6213482699709816
MAPE: 20.65202885301864
NMBE: 0.1001960325641531
CVRSME: 22.530352628320994
R SQUARED: 



MAPE: 21.183971589906896
NMBE: 3.1230423681493504
CVRSME: 23.341575626790274
R SQUARED: 0.45488262983112226
MAPE: 21.14803887039919
NMBE: 2.197376546955598
CVRSME: 23.01944047122325
R SQUARED: 0.46982504339846154
Modelling: Office_Jesus
Weatherfile: weather1.csv
MAPE: 143.98697775733626
NMBE: 4.097141967907661
CVRSME: 31.455292481301637
R SQUARED: 0.330548305538116
MAPE: 131.01868771972406
NMBE: 8.172203866096709
CVRSME: 31.066091793328617
R SQUARED: 0.34701225309398087
MAPE: 130.94593439821443
NMBE: 8.985014310236568
CVRSME: 34.0551613661976
R SQUARED: 0.21531080121944068
MAPE: 147.472717445552
NMBE: 9.316838322023584
CVRSME: 39.557826365804324
R SQUARED: -0.05875781584640505
MAPE: 133.63532064910333
NMBE: 8.831195902194585
CVRSME: 35.02563789605391
R SQUARED: 0.16995068274967196
MAPE: 131.3697755558292
NMBE: 8.576143794149983
CVRSME: 32.82315520352936
R SQUARED: 0.27105889251576154
MAPE: 184.46687848774855
NMBE: 22.525918992863506
CVRSME: 312.3900610919685
R SQUARED: -65.027705527557



MAPE: 152.87857827020383
NMBE: 3.3643004865603876
CVRSME: 28.92837994417283
R SQUARED: 0.43378676665678695
MAPE: 143.94642722656306
NMBE: 9.890877806642619
CVRSME: 31.734906426762116
R SQUARED: 0.31859355990972216
