In [33]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

In [34]:
# Metadata table indexed by its `uid` column; the two date columns are parsed day-first.
meta = pd.read_csv(
    "../input/meta_open.csv",
    index_col='uid',
    parse_dates=["datastart", "dataend"],
    dayfirst=True,
)

# Time-series table indexed by its `timestamp` column; the parsed (naive) index
# is then tagged as UTC.
temporal = pd.read_csv(
    "../input/temp_open_utc_complete.csv",
    index_col='timestamp',
    parse_dates=True,
)
temporal = temporal.tz_localize('utc')

In [35]:
# Keep only the columns of `temporal` whose name contains "Office".
buildingnames = temporal.columns
buildingnames = buildingnames[buildingnames.str.contains("Office")]

In [36]:
# Import all models we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from  sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TheilSenRegressor
    
# Registry of the regressors to evaluate. Every entry is a two-element list:
# [model-name, model-instance]. The name is taken from the estimator's class,
# which is exactly the label that was previously typed out by hand.
models = [
    [type(estimator).__name__, estimator]
    for estimator in (
        RandomForestRegressor(n_estimators=1000, random_state=42),
        AdaBoostRegressor(n_estimators=1000, random_state=42),
        BaggingRegressor(n_estimators=1000, random_state=42),
        DecisionTreeRegressor(random_state=42),
        DummyRegressor(),
        ExtraTreeRegressor(random_state=42),
        ExtraTreesRegressor(n_estimators=1000, random_state=42),
        GaussianProcessRegressor(random_state=42),
        GradientBoostingRegressor(n_estimators=1000, random_state=42),
        HuberRegressor(),
        KNeighborsRegressor(),
        MLPRegressor(random_state=42),
        PassiveAggressiveRegressor(random_state=42),
        RANSACRegressor(random_state=42),
        SGDRegressor(random_state=42),
        TheilSenRegressor(random_state=42),
    )
]

In [41]:
# Evaluate one regressor on every building in `buildingnames` and collect its
# error metrics (MAPE, NMBE, CVRSME, RSQUARED).
# Nothing is written to disk here: the per-building table is returned so the
# caller can save it (e.g. as modelName_metrics.csv) if wanted.
def _loadOutdoorTemp(weatherfilename, timezone, n_hours):
    """Return the hourly outdoor-temperature frame for one weather file.

    Reads ``../input/<weatherfilename>`` (``-9999`` is treated as missing),
    localizes its ``timestamp`` index to ``timezone``, averages every column
    whose name contains ``'Temperature'`` into hourly bins, and lays the result
    onto ``n_hours`` consecutive hours starting at the first weather timestamp.
    Gaps are filled forward first, then backward.
    """
    weather = pd.read_csv(os.path.join("../input/", weatherfilename),
                          index_col='timestamp', parse_dates=True, na_values='-9999')
    weather = weather.tz_localize(timezone, ambiguous='infer')
    temperature_columns = [col for col in weather.columns if 'Temperature' in col]
    outdoor_temp = weather[temperature_columns].resample("H").mean()
    # pd.date_range replaces DatetimeIndex(start=..., periods=..., freq=...),
    # which newer pandas no longer accepts.
    hourly_index = pd.date_range(start=outdoor_temp.index[0], periods=n_hours, freq="H")
    # .ffill()/.bfill() replace the deprecated fillna(method=...).
    return outdoor_temp.reindex(hourly_index).ffill().bfill()


def _buildFeatures(building_rows, temperature_rows):
    """Return the feature matrix: one-hot hour, one-hot weekday, TemperatureC.

    The three parts are joined column-wise by position, so ``temperature_rows``
    must contain exactly as many rows as ``building_rows``.
    """
    feature_frame = pd.concat([pd.get_dummies(building_rows.index.hour),
                               pd.get_dummies(building_rows.index.dayofweek),
                               pd.Series(temperature_rows.TemperatureC.values)], axis=1)
    # Force a plain numeric array whatever dtype the dummy columns come in.
    return np.array(feature_frame, dtype=float)


def _scoreFold(test_labels, predictions):
    """Return (MAPE, NMBE, CVRSME, RSQUARED) for one fold's predictions."""
    residuals = test_labels - predictions
    n_observed = pd.Series(test_labels).count()  # number of non-missing labels
    mean_label = np.mean(test_labels)
    # NOTE(review): MAPE divides by the observed value, so a zero reading gives
    # inf/nan -- confirm the readings are strictly positive.
    MAPE = 100 * np.mean(np.abs(residuals) / test_labels)
    NMBE = 100 * (np.sum(residuals) / (n_observed * mean_label))
    CVRSME = 100 * ((np.sum(residuals ** 2) / (n_observed - 1)) ** 0.5) / mean_label
    RSQUARED = r2_score(test_labels, predictions)
    return MAPE, NMBE, CVRSME, RSQUARED


def createMetrics(modelName, model):
    """Cross-validate ``model`` on every building and record its error metrics.

    For each name in the module-level ``buildingnames`` the building's column
    of ``temporal`` is converted to the building's timezone and truncated to
    the ``datastart``/``dataend`` range from ``meta``. The calendar months 1-12
    are split with ``TimeSeriesSplit(n_splits=5)``; in every fold the model is
    refitted on the rows of the training months and scored on the rows of the
    test months.

    Parameters
    ----------
    modelName : str
        Label printed in the progress output.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place for every fold.

    Returns
    -------
    pandas.DataFrame
        One row per building, columns ``MAPE``, ``NMBE``, ``CVRSME`` and
        ``RSQUARED``, each the mean over the five folds.

    Side effects
    ------------
    The same fold-averaged values are written into the module-level dicts
    ``MAPE_data``, ``NMBE_data``, ``CVRSME_data`` and ``RSQUARED_data`` (keyed
    by building name); these must exist before the call. Previously each fold
    overwrote the one before it, so only the last fold was kept.
    """
    print('\n\n' + modelName + '\n_____________')
    metric_names = ["MAPE", "NMBE", "CVRSME", "RSQUARED"]
    months = np.arange(1, 13)
    tscv = TimeSeriesSplit(n_splits=5)
    averaged = {}

    for singlebuilding in buildingnames:
        print("Modelling: " + singlebuilding)
        # Get Data (one metadata lookup instead of transposing `meta` repeatedly)
        building_meta = meta.loc[singlebuilding]
        single_timezone = building_meta.timezone
        building_series = temporal[singlebuilding].tz_convert(single_timezone)
        single_building_data = pd.DataFrame(
            building_series.truncate(before=building_meta.datastart, after=building_meta.dataend))

        # Get weather file. It does not depend on the fold, so it is read once
        # per building rather than once per fold.
        weatherfilename = building_meta.newweatherfilename
        print("Weatherfile: "+weatherfilename)
        outdoor_temp = _loadOutdoorTemp(weatherfilename, single_timezone, len(single_building_data))
        # NOTE(review): temperature rows are paired with meter rows purely by
        # position after month filtering; this assumes the weather series starts
        # at the same hour as the truncated meter series -- confirm.

        building_months = single_building_data.index.month
        weather_months = outdoor_temp.index.month
        fold_scores = []
        for train_index, test_index in tscv.split(months):
            month_train, month_test = months[train_index], months[test_index]

            # Split into Training and Testing
            trainingdata = single_building_data[building_months.isin(month_train)]
            testdata = single_building_data[building_months.isin(month_test)]

            train_features = _buildFeatures(trainingdata, outdoor_temp[weather_months.isin(month_train)])
            train_labels = np.array(trainingdata[singlebuilding].values)
            test_features = _buildFeatures(testdata, outdoor_temp[weather_months.isin(month_test)])
            test_labels = np.array(testdata[singlebuilding].values)

            # Train on the training months, predict the test months
            model.fit(train_features, train_labels)
            predictions = model.predict(test_features)

            MAPE, NMBE, CVRSME, RSQUARED = _scoreFold(test_labels, predictions)
            print("MAPE: "+str(MAPE))
            print("NMBE: "+str(NMBE))
            print("CVRSME: "+str(CVRSME))
            print("R SQUARED: "+str(RSQUARED))
            fold_scores.append((MAPE, NMBE, CVRSME, RSQUARED))

        # Summarize the building by the mean of each metric over all folds.
        building_scores = np.mean(fold_scores, axis=0)
        MAPE_data[singlebuilding] = building_scores[0]
        NMBE_data[singlebuilding] = building_scores[1]
        CVRSME_data[singlebuilding] = building_scores[2]
        RSQUARED_data[singlebuilding] = building_scores[3]
        averaged[singlebuilding] = building_scores

    # Rows: buildings, columns: the four metrics.
    return pd.DataFrame(averaged, index=metric_names).T


In [None]:
# Module-level stores that createMetrics fills, keyed by building name.
MAPE_data = {}
RSQUARED_data = {}
NMBE_data = {}
CVRSME_data = {}

# Per-model copy of those stores: model name -> DataFrame with one row per
# building and the columns MAPE, NMBE, CVRSME, RSQUARED.
metrics_by_model = {}

for modelName, model in models:
    # Every model writes into the same four dicts. Empty them before the run
    # and copy the result out afterwards; otherwise only the numbers of the
    # last model in `models` would be left once the loop finishes.
    for store in (MAPE_data, NMBE_data, CVRSME_data, RSQUARED_data):
        store.clear()
    createMetrics(modelName, model)
    metrics_by_model[modelName] = pd.DataFrame({
        "MAPE": MAPE_data,
        "NMBE": NMBE_data,
        "CVRSME": CVRSME_data,
        "RSQUARED": RSQUARED_data,
    })
    



RandomForestRegressor
_____________
Modelling: Office_Cristina
TRAIN: [0 1] TEST: [2 3]
x: 0
[1 2]
Weatherfile: weather2.csv
MAPE: 18.597636079772805
NMBE: -3.626298944257014
CVRSME: 20.105609979844484
R SQUARED: 0.5908484641421823
TRAIN: [0 1 2 3] TEST: [4 5]
x: 0
[1 2 3 4]
Weatherfile: weather2.csv
MAPE: 19.445369460358812
NMBE: -7.758875382057207
CVRSME: 20.46892970073604
R SQUARED: 0.618678615256028
TRAIN: [0 1 2 3 4 5] TEST: [6 7]
x: 0
[1 2 3 4 5 6]
Weatherfile: weather2.csv
MAPE: 18.00639325725745
NMBE: 1.2399987848923193
CVRSME: 21.812694063060516
R SQUARED: 0.5619868073635068
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]
x: 0
[1 2 3 4 5 6 7 8]
Weatherfile: weather2.csv
MAPE: 17.494028277148413
NMBE: 1.576627038620565
CVRSME: 20.206868203784833
R SQUARED: 0.571839079303199
TRAIN: [0 1 2 3 4 5 6 7 8 9] TEST: [10 11]
x: 0
[ 1  2  3  4  5  6  7  8  9 10]
Weatherfile: weather2.csv
