### Model evaluation

In [1]:
# Loading libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the meter readings and keep only rows with a strictly positive reading.
# The explicit .copy() makes the filtered frame independent of the raw one, so
# the column assignments below do not raise SettingWithCopyWarning on pandas
# versions without copy-on-write.

energy_0 = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz")
energy_0 = energy_0[energy_0["meter_reading"] > 0].copy()
energy_0["timestamp"] = pd.to_datetime(energy_0["timestamp"])
# Hour of day is the only feature the baseline models below use.
energy_0["hour"] = energy_0["timestamp"].dt.hour
print(energy_0.head())

      building_id  meter           timestamp  meter_reading  hour
704             0      0 2016-01-30 08:00:00        43.6839     8
725             0      0 2016-01-31 05:00:00        37.5408     5
737             0      0 2016-01-31 17:00:00        52.5571    17
2366            0      0 2016-04-08 14:00:00        59.3827    14
2923            0      0 2016-05-01 19:00:00       448.0000    19


In [3]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and every metric derived from
# it, is reproducible after Restart Kernel -> Run All.

energy_0_train, energy_0_test = train_test_split(energy_0, test_size=0.2, random_state=42)
print(energy_0_train.head())

      building_id  meter           timestamp  meter_reading  hour
5517            0      0 2016-08-17 21:00:00        311.248    21
7734            0      0 2016-11-18 06:00:00        176.101     6
6951            0      0 2016-10-16 15:00:00        247.769    15
6248            0      0 2016-09-17 08:00:00        239.579     8
4330            0      0 2016-06-29 10:00:00        240.944    10


In [5]:
# Mean and median meter reading for each hour of the day, from the training rows.
# Only the meter_reading column is aggregated; reducing every column
# (timestamp, building_id, ...) and discarding the rest is wasted work and can
# fail on columns that do not support mean/median.

energy_0_train_hours = energy_0_train.groupby("hour")
energy_0_train_averages = energy_0_train_hours["meter_reading"].agg(
    Average="mean",
    Median="median",
)
print(energy_0_train_averages.head())

         Average   Median
hour                     
0     237.100087  239.579
1     237.706186  239.579
2     238.361611  242.309
3     239.462417  244.357
4     240.552162  245.722


#### Model checking function

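\begin{equation}
RMSLE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}{\left(\log(p_i+1) - \log(a_i+1)\right)^2}},
\end{equation}

where $p_i$ is the predicted value, $a_i$ is the actual value, and $n$ is the number of observations.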

In [9]:
def calculate_model(x, averages=None):
    """Add squared log-error terms for three baseline predictions to one row.

    The row's ``log(meter_reading + 1)`` is compared against the log of the
    per-hour mean, the log of the per-hour median, and a constant zero
    prediction (whose log term is 0). The squared differences are the
    per-row terms of the RMSLE sum.

    Parameters
    ----------
    x : pandas.Series
        One observation with ``meter_reading`` and ``hour`` entries.
    averages : pandas.DataFrame, optional
        Lookup table indexed by hour with ``Average`` and ``Median`` columns.
        Defaults to the notebook-level ``energy_0_train_averages``.

    Returns
    -------
    pandas.Series
        A copy of ``x`` extended with ``meter_reading_mean_q``,
        ``meter_reading_median_q`` and ``meter_reading_zero_q``.

    Raises
    ------
    KeyError
        If the row's hour is not a label in the index of ``averages``.
    """
    if averages is None:
        averages = energy_0_train_averages

    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    scored = x.copy()

    # One label lookup yields both per-hour baselines.
    hour_stats = averages.loc[x["hour"]]

    # log1p(v) == log(v + 1), but stays accurate for small v.
    meter_reading_log = np.log1p(x["meter_reading"])
    meter_reading_mean = np.log1p(hour_stats["Average"])
    meter_reading_median = np.log1p(hour_stats["Median"])

    scored["meter_reading_mean_q"] = (meter_reading_log - meter_reading_mean) ** 2
    scored["meter_reading_median_q"] = (meter_reading_log - meter_reading_median) ** 2
    # Predicting 0 gives log(0 + 1) == 0, so the error is the log reading itself.
    scored["meter_reading_zero_q"] = meter_reading_log ** 2
    return scored

# Score every held-out row against the baselines; the rows returned by
# calculate_model are expanded back into a DataFrame that replaces energy_0_test.
energy_0_test = energy_0_test.apply(
    calculate_model,
    axis="columns",
    result_type="expand",
)
print(energy_0_test.head(5))

      building_id  meter           timestamp  meter_reading  hour  \
7925            0      0 2016-11-26 05:00:00        236.166     5   
6036            0      0 2016-09-08 12:00:00        246.404    12   
4721            0      0 2016-07-15 17:00:00        297.596    17   
3413            0      0 2016-05-22 05:00:00        251.182     5   
6373            0      0 2016-09-22 13:00:00        255.278    13   

      meter_reading_mean_q  meter_reading_median_q  meter_reading_zero_q  
7925              0.000021                0.001349             29.907339  
6036              0.001371                0.000123             30.371370  
4721              0.060009                0.046675             32.479644  
3413              0.003232                0.000608             30.582571  
6373              0.004022                0.001896             30.761031  


In [10]:
# RMSLE per baseline: square root of (sum of squared log errors / number of test rows).
energy_0_test_mean_rmsle, energy_0_test_median_rmsle, energy_0_test_zero_rmsle = (
    np.sqrt(energy_0_test[squared_error_column].sum() / len(energy_0_test))
    for squared_error_column in (
        "meter_reading_mean_q",
        "meter_reading_median_q",
        "meter_reading_zero_q",
    )
)

print("Median Quality: ", energy_0_test_median_rmsle)
print("Average Quality: ", energy_0_test_mean_rmsle)
print("Zro Quality: ", energy_0_test_zero_rmsle)

Median Quality:  0.2593341745598782
Average Quality:  0.2575585541928154
Zro Quality:  5.455875927381129
