In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

In [3]:
# Building metadata, indexed by building uid; the start/end columns are
# parsed as day-first dates.
meta = pd.read_csv(
    "../input/meta_open.csv",
    index_col='uid',
    parse_dates=["datastart", "dataend"],
    dayfirst=True,
)

# Readings indexed by timestamp; the parsed index is localized to UTC.
temporal = pd.read_csv(
    "../input/temp_open_utc_complete.csv",
    index_col='timestamp',
    parse_dates=True,
).tz_localize('utc')

In [4]:
# Keep only the columns of `temporal` whose name contains "Office".
is_office_column = temporal.columns.str.contains("Office")
buildingnames = temporal.columns[is_office_column]

In [5]:
# Import all models we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from  sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TheilSenRegressor
    
# Registry of the regressors to evaluate. Every entry is a two-element list:
# [model-name, estimator instance]. Entries that are commented out are
# disabled for this run; uncomment a line to include that model.
models = [
    # ['RandomForestRegressor', RandomForestRegressor(n_estimators = 1000, random_state = 42)],
    # ['AdaBoostRegressor', AdaBoostRegressor(n_estimators = 1000, random_state = 42)],
    # ['BaggingRegressor', BaggingRegressor(n_estimators = 1000, random_state = 42)],
    # ['DecisionTreeRegressor', DecisionTreeRegressor(random_state = 42)],
    ['DummyRegressor', DummyRegressor()],
    ['ExtraTreeRegressor', ExtraTreeRegressor(random_state=42)],
    ['ExtraTreesRegressor', ExtraTreesRegressor(n_estimators=1000, random_state=42)],
    # ['GaussianProcessRegressor', GaussianProcessRegressor(random_state = 42)],
    # ['GradientBoostingRegressor', GradientBoostingRegressor(n_estimators = 1000, random_state = 42)],
    # ['HuberRegressor', HuberRegressor()],
    # ['KNeighborsRegressor', KNeighborsRegressor()],
    # ['MLPRegressor', MLPRegressor(random_state = 42)],
    # ['PassiveAggressiveRegressor', PassiveAggressiveRegressor(random_state = 42)],
    # ['RANSACRegressor', RANSACRegressor(random_state = 42)],
    # ['SGDRegressor', SGDRegressor(random_state = 42)],
    # ['TheilSenRegressor', TheilSenRegressor(random_state = 42)]
]

  from numpy.core.umath_tests import inner1d


In [6]:
# Column order of the per-model metrics table that is written to CSV.
_METRIC_COLUMNS = ["MAPE", "NMBE", "CVRSME", "RSQUARED"]


def _build_features(timestamps, temperatures):
    """Build the feature matrix: one-hot hour, one-hot day-of-week, temperature.

    The three parts are concatenated column-wise by position, so
    ``temperatures`` must have one value per entry of ``timestamps``.
    The result is forced to float so the dummy columns have the same dtype
    regardless of what ``pd.get_dummies`` returns in the installed pandas.
    """
    feature_frame = pd.concat([pd.get_dummies(timestamps.hour),
                               pd.get_dummies(timestamps.dayofweek),
                               pd.Series(temperatures)], axis=1)
    return np.array(feature_frame, dtype=float)


def _score_predictions(test_labels, predictions):
    """Return (MAPE, NMBE, CVRSME, RSQUARED) for a single train/test split.

    MAPE divides by the raw labels, so a label of exactly zero yields
    inf/nan for that split.
    """
    residuals = test_labels - predictions
    n_obs = pd.Series(test_labels).count()
    mean_label = np.mean(test_labels)

    mape = 100 * np.mean(np.abs(residuals) / test_labels)
    nmbe = 100 * (np.sum(residuals) / (n_obs * mean_label))
    cvrsme = 100 * ((np.sum(residuals ** 2) / (n_obs - 1)) ** 0.5) / mean_label
    rsquared = r2_score(test_labels, predictions)
    return mape, nmbe, cvrsme, rsquared


def createMetrics(modelName, model):
    """Cross-validate ``model`` on every building and save the averaged metrics.

    For each name in the module-level ``buildingnames`` the building's series
    is taken from ``temporal``, converted to the building's timezone and
    truncated to its ``datastart``/``dataend`` from ``meta``. The calendar
    months 1..12 are split with ``TimeSeriesSplit(n_splits=5)``; for every
    split the model is fitted on the rows of the training months and scored
    on the rows of the test months. The four metrics are averaged over the
    splits, printed, and stored in the module-level dicts ``MAPE_data``,
    ``NMBE_data``, ``CVRSME_data`` and ``RSQUARED_data`` (keyed by building).

    The metrics table is rewritten to
    ``../results-timeseries/<modelName>_metrics_cross_validation.csv`` after
    every building, so a partial result survives an interrupted run.

    Parameters
    ----------
    modelName : str
        Label used in the printed header and in the output file name.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``; it is refitted
        for every split of every building.

    Returns
    -------
    pandas.DataFrame
        One row per processed building, columns MAPE, NMBE, CVRSME, RSQUARED.
    """
    print('\n\n' + modelName + '\n_____________')

    # The result dicts are shared across calls. Clear them so the files
    # written while this model is still running never contain values left
    # over from a previously evaluated model.
    for store in (MAPE_data, NMBE_data, CVRSME_data, RSQUARED_data):
        store.clear()

    # The split is done over calendar months, not over individual rows.
    months = np.arange(1, 13)
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    metrics = pd.DataFrame(columns=_METRIC_COLUMNS)

    for singlebuilding in buildingnames[:]:
        print("Modelling: " + singlebuilding)

        # Get Data (a single row lookup instead of transposing `meta` each time)
        building_meta = meta.loc[singlebuilding]
        single_timezone = building_meta.timezone
        single_start = building_meta.datastart
        single_end = building_meta.dataend
        # NOTE(review): datastart/dataend appear to be parsed without a
        # timezone while the index here is tz-aware -- confirm how the
        # installed pandas interprets these bounds.
        single_building_data = pd.DataFrame(
            temporal[singlebuilding]
            .tz_convert(single_timezone)
            .truncate(before=single_start, after=single_end)
        )

        # Get weather file
        weatherfilename = building_meta.newweatherfilename
        print("Weatherfile: "+weatherfilename)
        weather = pd.read_csv(os.path.join("../input/", weatherfilename),
                              index_col='timestamp', parse_dates=True, na_values='-9999')
        weather = weather.tz_localize(single_timezone, ambiguous='infer')
        temperature_columns = [col for col in weather.columns if 'Temperature' in col]
        outdoor_temp = weather[temperature_columns].resample("H").mean()

        # pd.date_range replaces the removed DatetimeIndex(start=, periods=, freq=)
        # constructor form.
        # NOTE(review): the hourly index starts at the first weather timestamp,
        # not at the first meter timestamp, and temperatures are later joined
        # to the features by position -- confirm both series start together.
        hourly_index = pd.date_range(start=outdoor_temp.index[0],
                                     periods=len(single_building_data), freq="H")
        outdoor_temp = outdoor_temp.reindex(hourly_index).ffill().bfill()

        # Running sums of (MAPE, NMBE, CVRSME, RSQUARED); averaged after the splits.
        metric_sums = np.zeros(len(_METRIC_COLUMNS))

        for train_index, test_index in tscv.split(months):
            month_train, month_test = months[train_index], months[test_index]

            # Split into Training and Testing
            trainingdata = single_building_data[single_building_data.index.month.isin(month_train)]
            testdata = single_building_data[single_building_data.index.month.isin(month_test)]
            train_temp = outdoor_temp[outdoor_temp.index.month.isin(month_train)].TemperatureC.values
            test_temp = outdoor_temp[outdoor_temp.index.month.isin(month_test)].TemperatureC.values

            train_features = _build_features(trainingdata.index, train_temp)
            train_labels = np.array(trainingdata[singlebuilding].values)
            test_features = _build_features(testdata.index, test_temp)
            test_labels = np.array(testdata[singlebuilding].values)

            # Train on the training months, predict the held-out months
            model.fit(train_features, train_labels)
            predictions = model.predict(test_features)

            metric_sums += _score_predictions(test_labels, predictions)

        MAPE_avg, NMBE_avg, CVRSME_avg, RSQUARED_avg = metric_sums / n_splits

        print("MAPE: "+str(MAPE_avg))
        print("NMBE: "+str(NMBE_avg))
        print("CVRSME: "+str(CVRSME_avg))
        print("R SQUARED: "+str(RSQUARED_avg))

        MAPE_data[singlebuilding] = MAPE_avg
        NMBE_data[singlebuilding] = NMBE_avg
        CVRSME_data[singlebuilding] = CVRSME_avg
        RSQUARED_data[singlebuilding] = RSQUARED_avg

        # Checkpoint: rewrite the file with every building processed so far.
        metrics = pd.DataFrame([MAPE_data, NMBE_data, CVRSME_data, RSQUARED_data]).T
        metrics.columns = _METRIC_COLUMNS
        metrics.to_csv('../results-timeseries/' + modelName + '_metrics_cross_validation.csv')

    return metrics


In [None]:
# Module-level result stores that createMetrics writes into, keyed by building name.
MAPE_data, RSQUARED_data, NMBE_data, CVRSME_data = {}, {}, {}, {}

# Evaluate every registered [model-name, estimator] pair in turn.
for model_name, estimator in models:
    createMetrics(model_name, estimator)
    



DummyRegressor
_____________
Modelling: Office_Cristina
Weatherfile: weather2.csv
MAPE: 36.212414024223534
NMBE: -9.677353937140435
CVRSME: 33.82465131497564
R SQUARED: -0.14337970081938872
Modelling: Office_Jesus
Weatherfile: weather1.csv
MAPE: 179.22145785231532
NMBE: 3.595851758033851
CVRSME: 39.63714423968816
R SQUARED: -0.5304647569816556
Modelling: Office_Jett
Weatherfile: weather1.csv
MAPE: 214.30086830725025
NMBE: -50.15843977473095
CVRSME: 90.54157139522513
R SQUARED: -1.507802715413983
Modelling: Office_Jerry
Weatherfile: weather1.csv
MAPE: 208.10324510388745
NMBE: -46.647150704215775
CVRSME: 68.23909245535694
R SQUARED: -4.985921360378656
Modelling: Office_Lesa
Weatherfile: weather5.csv
MAPE: 58.49768560941071
NMBE: -22.911943558953887
CVRSME: 58.42155117579231
R SQUARED: -2.8252341196368413
Modelling: Office_Jackie
Weatherfile: weather1.csv
MAPE: 490.08422928256294
NMBE: -22.248883275066095
CVRSME: 94.73146445247596
R SQUARED: -0.16797044579715797
Modelling: Office_Marla


MAPE: 37.5495748272625
NMBE: 15.784163232461642
CVRSME: 51.17346842370607
R SQUARED: -0.672797904277582
Modelling: Office_Mark
Weatherfile: weather3.csv
MAPE: 29.25201024140635
NMBE: 4.635383683341796
CVRSME: 35.75452254394979
R SQUARED: -0.07521190330981749
Modelling: Office_Travis
Weatherfile: weather8.csv
MAPE: 7.354136482553512
NMBE: -1.0755855659667233
CVRSME: 9.008328412035109
R SQUARED: -0.1796134488345547
Modelling: Office_Lena
Weatherfile: weather5.csv
MAPE: 64.82719962508232
NMBE: -36.38547996957094
CVRSME: 57.53388389883397
R SQUARED: -8.036653587881876
Modelling: Office_Max
Weatherfile: weather3.csv
MAPE: 40.23089094794386
NMBE: -3.6116700412636833
CVRSME: 38.938167820736055
R SQUARED: -0.029814482744594396
Modelling: Office_Gustavo
Weatherfile: weather5.csv
MAPE: 53.95565919350931
NMBE: -2.878425573980118
CVRSME: 33.24753718061716
R SQUARED: -0.023183653763375743
Modelling: Office_Penny
Weatherfile: weather4.csv
MAPE: 31.37082836019341
NMBE: -4.650576839427053
CVRSME: 32.9

MAPE: 49.294216068067065
NMBE: -19.279062380260054
CVRSME: 50.18382784442428
R SQUARED: -1.3816666621608684
Modelling: Office_Louise
Weatherfile: weather5.csv
MAPE: 47.40473675926323
NMBE: -28.583009499325517
CVRSME: 45.615209179513826
R SQUARED: -1.8894001952256214
Modelling: Office_Guillermo
Weatherfile: weather5.csv
MAPE: 39.2190030167835
NMBE: 1.0266840988574373
CVRSME: 39.04758982037564
R SQUARED: -0.018671123158486003
Modelling: Office_Paulina
Weatherfile: weather4.csv
MAPE: 12.4101229414122
NMBE: -4.461948312636734
CVRSME: 13.882229919178522
R SQUARED: -0.18921808519536637
Modelling: Office_Gabriela
Weatherfile: weather5.csv
MAPE: 27.879370833642717
NMBE: -16.08287070903649
CVRSME: 29.302775227531516
R SQUARED: -0.9974753684478375
Modelling: Office_Carolina
Weatherfile: weather2.csv
MAPE: 30.88134324123117
NMBE: -20.998057632152445
CVRSME: 31.179356907698395
R SQUARED: -1.8564359887882342
Modelling: Office_Noel
Weatherfile: weather9.csv
MAPE: 10.993921745964533
NMBE: 0.736450388

MAPE: 21.811214991049816
NMBE: -2.6100208112025247
CVRSME: 25.13141912122264
R SQUARED: 0.3642842931935625
Modelling: Office_Jesus
Weatherfile: weather1.csv
MAPE: 184.91725083828837
NMBE: 4.367435869821619
CVRSME: 34.90472721286486
R SQUARED: -0.23813620665046625
Modelling: Office_Jett
Weatherfile: weather1.csv
MAPE: 163.37295204604857
NMBE: -22.028873865425147
CVRSME: 75.4738936542332
R SQUARED: -0.6412670438432568
Modelling: Office_Jerry
Weatherfile: weather1.csv
MAPE: 154.80588519378176
NMBE: -22.535272421165065
CVRSME: 59.68533421893718
R SQUARED: -3.7289329662927253
Modelling: Office_Lesa
Weatherfile: weather5.csv
MAPE: 58.11758506503112
NMBE: -13.724460188470212
CVRSME: 67.99029508222807
R SQUARED: -4.321562210351184
Modelling: Office_Jackie
Weatherfile: weather1.csv
MAPE: 428.85296562195697
NMBE: -5.883793866164434
CVRSME: 62.002287947176384
R SQUARED: 0.4995910764784247
Modelling: Office_Marla
Weatherfile: weather3.csv
MAPE: 12.613859553060838
NMBE: -1.211172492853446
CVRSME: 1

MAPE: 11.064528210087998
NMBE: 3.929906561135829
CVRSME: 17.339194102260205
R SQUARED: 0.73848094655873
Modelling: Office_Travis
Weatherfile: weather8.csv
MAPE: 3.9953206685303178
NMBE: -0.03967397229173138
CVRSME: 5.984483683004845
R SQUARED: 0.4719534193234488
Modelling: Office_Lena
Weatherfile: weather5.csv
MAPE: 46.58198887592412
NMBE: -27.147193260784157
CVRSME: 49.5375290237012
R SQUARED: -6.958693259322786
Modelling: Office_Max
Weatherfile: weather3.csv
MAPE: 18.649249007839092
NMBE: 0.40109318447554454
CVRSME: 28.212499742665
R SQUARED: 0.45212376900189294
Modelling: Office_Gustavo
Weatherfile: weather5.csv
MAPE: 29.133301867188756
NMBE: 1.0815443472554436
CVRSME: 19.934401488559722
R SQUARED: 0.6149329729582631
Modelling: Office_Penny
Weatherfile: weather4.csv
MAPE: 14.721718473461866
NMBE: -2.5296851144068095
CVRSME: 21.222452871601238
R SQUARED: 0.5558816570369169
Modelling: Office_Garrett
Weatherfile: weather5.csv
MAPE: 204.35805822740926
NMBE: -0.23614032566517268
CVRSME: 

MAPE: 13.299596109376163
NMBE: 1.3177523732626242
CVRSME: 18.230547266467134
R SQUARED: 0.7762665227906625
Modelling: Office_Paulina
Weatherfile: weather4.csv
MAPE: 7.993799358960041
NMBE: -1.8573928121034893
CVRSME: 10.135091944117471
R SQUARED: 0.3464392694904972
Modelling: Office_Gabriela
Weatherfile: weather5.csv
MAPE: 14.739525448367266
NMBE: -7.991886778174137
CVRSME: 18.101988086214185
R SQUARED: 0.19280454685446588
Modelling: Office_Carolina
Weatherfile: weather2.csv
MAPE: 14.261121552494407
NMBE: -4.049263365201013
CVRSME: 18.806165591745014
R SQUARED: 0.06561174607125395
Modelling: Office_Noel
Weatherfile: weather9.csv
MAPE: 6.053344236911403
NMBE: 1.3752046406873548
CVRSME: 8.200598842119232
R SQUARED: 0.6148879747254391
Modelling: Office_Alannah
Weatherfile: weather0.csv
MAPE: 27.139039667280837
NMBE: 1.0449458819737416
CVRSME: 30.17223195931959
R SQUARED: 0.5410846233652706
Modelling: Office_Aliyah
Weatherfile: weather0.csv
MAPE: 6.171747854397006
NMBE: -1.339255190415368
