In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score

In [3]:
meta = pd.read_csv("../input/meta_open.csv", index_col='uid', parse_dates=["datastart","dataend"], dayfirst=True)
temporal = pd.read_csv("../input/temp_open_utc_complete.csv", index_col='timestamp', parse_dates=True).tz_localize('utc')

In [4]:
buildingnames = temporal.columns[temporal.columns.str.contains("Office")]

In [5]:
buildingnames

Index(['Office_Cristina', 'Office_Jesus', 'Office_Jett', 'Office_Jerry',
       'Office_Lesa', 'Office_Jackie', 'Office_Marla', 'Office_Maryann',
       'Office_Myron', 'Office_Conrad',
       ...
       'Office_Ellie', 'Office_Erik', 'Office_Evelyn', 'Office_Emer',
       'Office_Elena', 'Office_Emerald', 'Office_Ellis', 'Office_Elliot',
       'Office_Eddie', 'Office_Georgia'],
      dtype='object', length=156)

In [9]:
MAPE_data = {}
RSQUARED_data = {}
NMBE_data = {}
CVRSME_data = {}

for singlebuilding in buildingnames[:2]:
    print("Modelling: "+singlebuilding)
#     try:
    # Get Data
    single_timezone = meta.T[singlebuilding].timezone
    single_start = meta.T[singlebuilding].datastart
    single_end = meta.T[singlebuilding].dataend
    single_building_data = pd.DataFrame(temporal[singlebuilding].tz_convert(single_timezone).truncate(before=single_start,after=single_end))

    # Split into Training and Testing
    trainingdata = single_building_data[single_building_data.index.month.isin(["1","2","3","5","6","7","9","10","11"])]
    testdata = single_building_data[single_building_data.index.month.isin(["4","8","12"])]

    # Get weather file
    weatherfilename = meta.T[singlebuilding].newweatherfilename
    print("Weatherfile: "+weatherfilename)
    weather = pd.read_csv(os.path.join("../input/",weatherfilename),index_col='timestamp', parse_dates=True, na_values='-9999')
    weather = weather.tz_localize(single_timezone, ambiguous = 'infer')
    outdoor_temp = pd.DataFrame(weather[[col for col in weather.columns if 'Temperature' in col]]).resample("H").mean()
    outdoor_temp = outdoor_temp.reindex(pd.DatetimeIndex(start=outdoor_temp.index[0], periods=len(single_building_data), freq="H")).fillna(method='ffill').fillna(method='bfill')

    # Create training data array
    train_features = np.array(pd.concat([pd.get_dummies(trainingdata.index.hour),
                                         pd.get_dummies(trainingdata.index.dayofweek),
               pd.Series(outdoor_temp[outdoor_temp.index.month.isin(["1","2","3","5","6","7","9","10","11"])].TemperatureC.values)], axis=1))
    train_labels = np.array(trainingdata[singlebuilding].values)

    # Create test data array
    test_features = np.array(pd.concat([pd.get_dummies(testdata.index.hour),
                                         pd.get_dummies(testdata.index.dayofweek),
               pd.Series(outdoor_temp[outdoor_temp.index.month.isin(["4","8","12"])].TemperatureC.values)], axis=1))
    test_labels = np.array(testdata[singlebuilding].values)
    
    # Import the model we are using
    from sklearn.neighbors import RadiusNeighborsRegressor
    # Make model
    model = RadiusNeighborsRegressor()
    # Train the model on training data
    model.fit(train_features, train_labels);
    # Use the forest's predict method on the test data
    predictions = model.predict(test_features)
    
    # Calculate the absolute errors
    errors = abs(predictions - test_labels)

    # Calculate mean absolute percentage error (MAPE) and add to list
    MAPE = 100 * np.mean((errors / test_labels))
    NMBE = 100 * (sum(test_labels - predictions) / (pd.Series(test_labels).count() * np.mean(test_labels)))
    CVRSME = 100 * ((sum((test_labels - predictions)**2) / (pd.Series(test_labels).count()-1))**(0.5)) / np.mean(test_labels)
    RSQUARED = r2_score(test_labels, predictions)

    print("MAPE: "+str(MAPE))
    print("NMBE: "+str(NMBE))
    print("CVRSME: "+str(CVRSME))
    print("R SQUARED: "+str(RSQUARED))

    MAPE_data[singlebuilding] = MAPE
    NMBE_data[singlebuilding] = NMBE
    CVRSME_data[singlebuilding] = CVRSME
    RSQUARED_data[singlebuilding] = RSQUARED
        
#     except:
#         print("There was a problem")


Modelling: Office_Cristina
Weatherfile: weather2.csv


  out=out, **kwargs)
  ret, rcount, out=ret, casting='unsafe', subok=False)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [11]:
metrics = pd.DataFrame([MAPE_data, NMBE_data, CVRSME_data, RSQUARED_data]).T
metrics.columns = ["MAPE", "NMBE", "CVRSME", "RSQUARED"]

In [12]:
metrics

Unnamed: 0,MAPE,NMBE,CVRSME,RSQUARED
Office_Cristina,27.086057,1.016622,28.589987,0.18218
Office_Jesus,143.986978,4.097142,31.455292,0.330548


In [13]:
metrics.to_csv("RF_metrics.csv")