In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Building metadata, indexed by the 'uid' column; the data start/end columns
# are parsed as day-first dates.
meta = pd.read_csv(
    "../input/meta_open.csv",
    index_col='uid',
    parse_dates=["datastart", "dataend"],
    dayfirst=True,
)

# Time-indexed readings table; its naive timestamps are declared to be UTC.
temporal = (
    pd.read_csv(
        "../input/temp_open_utc_complete.csv",
        index_col='timestamp',
        parse_dates=True,
    )
    .tz_localize('utc')
)

In [3]:
# Keep only the columns of `temporal` whose name matches "Office".
is_office_column = temporal.columns.str.contains("Office")
buildingnames = temporal.columns[is_office_column]

In [4]:
# Import all models we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from  sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TheilSenRegressor
    
# Candidate regressors to benchmark. `models` is a list of two-element lists:
# [model-name, model-instance]. The name is taken from the estimator's class,
# which is exactly the label each entry carried when written out by hand.
#
# Currently disabled (kept for reference):
#   RandomForestRegressor(n_estimators = 1000, random_state = 42)
#   AdaBoostRegressor(n_estimators = 1000, random_state = 42)
#   BaggingRegressor(n_estimators = 1000, random_state = 42)
#   DecisionTreeRegressor(random_state = 42)
models = [
    [type(estimator).__name__, estimator]
    for estimator in (
        DummyRegressor(),
        ExtraTreeRegressor(random_state=42),
        ExtraTreesRegressor(n_estimators=1000, random_state=42),
        GaussianProcessRegressor(random_state=42),
        GradientBoostingRegressor(n_estimators=1000, random_state=42),
        HuberRegressor(),
        KNeighborsRegressor(),
        MLPRegressor(random_state=42),
        PassiveAggressiveRegressor(random_state=42),
        RANSACRegressor(random_state=42),
        SGDRegressor(random_state=42),
        TheilSenRegressor(random_state=42),
    )
]

  from numpy.core.umath_tests import inner1d


In [5]:
# Produce a CSV file of metrics (MAPE, NMBE, CVRSME, RSQUARED) for the provided model.
# Results are saved as ../results-timeseries/<modelName>_metrics.csv
def createMetrics(modelName, model):
    """Cross-validate ``model`` on every office building and save the metrics.

    For each building in ``buildingnames`` the meter series is converted to
    the building's timezone, truncated to its ``datastart``/``dataend``
    window and evaluated with a 5-split ``TimeSeriesSplit`` over the calendar
    months 1..12. Features are the one-hot hour of day, the one-hot day of
    week and the hourly outdoor temperature (``TemperatureC``).

    The per-building averages over the folds of MAPE, NMBE, CVRSME and
    R-squared are printed, stored in the module-level dicts ``MAPE_data``,
    ``NMBE_data``, ``CVRSME_data`` and ``RSQUARED_data`` (which must exist
    before the call), and written to
    ``../results-timeseries/<modelName>_metrics.csv`` after every building.

    Parameters
    ----------
    modelName : str
        Label used in the log output and in the CSV file name.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted for
        every fold of every building.

    Returns
    -------
    None
    """
    print('\n\n' + modelName + '\n_____________')

    # The month-level split does not depend on the building: build it once.
    months = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Make sure the target directory exists before any expensive fitting.
    output_dir = '../results-timeseries/'
    os.makedirs(output_dir, exist_ok=True)

    # Buildings evaluated during THIS call. The module-level dicts are shared
    # between models, so they may still hold values from a previous model.
    processed = []

    for singlebuilding in buildingnames[:]:
        print("Modelling: " + singlebuilding)
        # Get Data
        building_meta = meta.T[singlebuilding]
        single_timezone = building_meta.timezone
        single_start = building_meta.datastart
        single_end = building_meta.dataend
        single_building_data = pd.DataFrame(temporal[singlebuilding].tz_convert(single_timezone).truncate(before=single_start,after=single_end))

        # The weather data depends only on the building, not on the fold,
        # so it is loaded once here instead of once per fold.
        outdoor_temp = _loadOutdoorTemp(building_meta.newweatherfilename,
                                        single_timezone,
                                        len(single_building_data))

        # Running totals of (MAPE, NMBE, CVRSME, RSQUARED) over the folds.
        fold_totals = np.zeros(4)
        for train_index, test_index in tscv.split(months):
            month_train, month_test = months[train_index], months[test_index]

            train_features, train_labels = _monthFeaturesAndLabels(
                single_building_data, outdoor_temp, singlebuilding, month_train)
            test_features, test_labels = _monthFeaturesAndLabels(
                single_building_data, outdoor_temp, singlebuilding, month_test)

            # Train the model on training data
            model.fit(train_features, train_labels)
            # Predict the held-out months
            predictions = model.predict(test_features)

            fold_totals += _foldMetrics(test_labels, predictions)

        # Average over the folds; this is what gets printed AND stored.
        MAPE_avg, NMBE_avg, CVRSME_avg, RSQUARED_avg = fold_totals / n_splits

        print("MAPE: "+str(MAPE_avg))
        print("NMBE: "+str(NMBE_avg))
        print("CVRSME: "+str(CVRSME_avg))
        print("R SQUARED: "+str(RSQUARED_avg))

        MAPE_data[singlebuilding] = MAPE_avg
        NMBE_data[singlebuilding] = NMBE_avg
        CVRSME_data[singlebuilding] = CVRSME_avg
        RSQUARED_data[singlebuilding] = RSQUARED_avg
        processed.append(singlebuilding)

        # Checkpoint after every building. Restrict the table to buildings
        # evaluated in this call so that rows left in the shared dicts by a
        # previous model never end up in this model's file.
        metrics = pd.DataFrame([MAPE_data, NMBE_data, CVRSME_data, RSQUARED_data]).T
        metrics.columns = ["MAPE", "NMBE", "CVRSME", "RSQUARED"]
        metrics = metrics.loc[processed]
        metrics.to_csv(output_dir + modelName + '_metrics.csv')


def _loadOutdoorTemp(weatherfilename, timezone, n_hours):
    """Return hourly mean temperature columns of a weather file for ``n_hours`` hours."""
    print("Weatherfile: "+weatherfilename)
    weather = pd.read_csv(os.path.join("../input/",weatherfilename),index_col='timestamp', parse_dates=True, na_values='-9999')
    weather = weather.tz_localize(timezone, ambiguous = 'infer')
    temperature_columns = [col for col in weather.columns if 'Temperature' in col]
    outdoor_temp = weather[temperature_columns].resample("H").mean()
    # Regular hourly grid starting at the first weather timestamp; gaps are
    # filled forward first, then backward for any leading hole.
    hourly_index = pd.date_range(start=outdoor_temp.index[0], periods=n_hours, freq="H")
    return outdoor_temp.reindex(hourly_index).ffill().bfill()


def _monthFeaturesAndLabels(building_data, outdoor_temp, column, selected_months):
    """Build the (features, labels) arrays for the rows falling in ``selected_months``."""
    subset = building_data[building_data.index.month.isin(selected_months)]
    # NOTE(review): the temperature rows are matched to the meter rows purely
    # by position, which assumes both cover the same hours - confirm.
    temperatures = outdoor_temp[outdoor_temp.index.month.isin(selected_months)].TemperatureC.values
    features = np.array(pd.concat([pd.get_dummies(subset.index.hour),
                                   pd.get_dummies(subset.index.dayofweek),
                                   pd.Series(temperatures)], axis=1))
    labels = np.array(subset[column].values)
    return features, labels


def _foldMetrics(test_labels, predictions):
    """Return (MAPE, NMBE, CVRSME, RSQUARED) of one fold; the first three in percent."""
    residuals = test_labels - predictions
    n_obs = pd.Series(test_labels).count()  # number of non-NaN labels
    mean_label = np.mean(test_labels)
    # A label equal to 0 makes the corresponding MAPE term infinite/undefined.
    MAPE = 100 * np.mean(np.abs(residuals) / test_labels)
    NMBE = 100 * (np.sum(residuals) / (n_obs * mean_label))
    CVRSME = 100 * ((np.sum(residuals**2) / (n_obs - 1))**(0.5)) / mean_label
    RSQUARED = r2_score(test_labels, predictions)
    return MAPE, NMBE, CVRSME, RSQUARED


In [None]:
# Result stores filled in by createMetrics (building name -> metric value).
MAPE_data, RSQUARED_data, NMBE_data, CVRSME_data = {}, {}, {}, {}

# Evaluate every [model-name, model] pair in turn.
for modelName, model in models:
    createMetrics(modelName, model)
    



DummyRegressor
_____________
Modelling: Office_Cristina
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
MAPE: 36.212414024223534
NMBE: -9.677353937140435
CVRSME: 33.82465131497564
R SQUARED: -0.14337970081938872
Modelling: Office_Jesus
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
MAPE: 179.22145785231532
NMBE: 3.595851758033851
CVRSME: 39.63714423968816
R SQUARED: -0.5304647569816556
Modelling: Office_Jett
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
MAPE: 214.30086830725025
NMBE: -50.15843977473095
CVRSME: 90.54157139522513
R SQUARED: -1.507802715413983
Modelling: Office_Jerry
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
Weatherfile: weather1.csv
MAPE: 208.10324510388745
NMBE: 

Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
MAPE: 38.5569278023778
NMBE: -2.373959819828947
CVRSME: 42.24229206833132
R SQUARED: -0.007751507255874213
Modelling: Office_Scottie
Weatherfile: weather6.csv
Weatherfile: weather6.csv
Weatherfile: weather6.csv
Weatherfile: weather6.csv
Weatherfile: weather6.csv
MAPE: 39.934972669046424
NMBE: -0.9297078412600657
CVRSME: 43.89478962433446
R SQUARED: -0.005645459485162707
Modelling: Office_Cecelia
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
Weatherfile: weather2.csv
MAPE: 56.05634798638747
NMBE: -29.886669892833197
CVRSME: 50.42569754734223
R SQUARED: -1.4339263406573777
Modelling: Office_Nelson
Weatherfile: weather9.csv
Weatherfile: weather9.csv
Weatherfile: weather9.csv
Weatherfile: weather9.csv
Weatherfile: weather9.csv
MAPE: 30.819490073991595
NMBE: -1.9238940769102257
CVRSME: 31.284817465286608
R SQUARED: -0.0167901417523