In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the meter readings, keep only rows with a positive reading, parse the
# timestamp column and derive the hour-of-day column used for grouping below.
# Written as one chain so re-running the cell always starts from the raw file.
energy_0 = (
    pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz')
    .loc[lambda frame: frame['meter_reading'] > 0]
    .assign(
        # `assign` evaluates keywords in order, so `hour` sees the parsed timestamps.
        timestamp=lambda frame: pd.to_datetime(frame['timestamp']),
        hour=lambda frame: frame['timestamp'].dt.hour,
    )
)
print(energy_0.head())

      building_id  meter           timestamp  meter_reading  hour
704             0      0 2016-01-30 08:00:00        43.6839     8
725             0      0 2016-01-31 05:00:00        37.5408     5
737             0      0 2016-01-31 17:00:00        52.5571    17
2366            0      0 2016-04-08 14:00:00        59.3827    14
2923            0      0 2016-05-01 19:00:00       448.0000    19


In [5]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# fresh "Restart & Run All" reproduces the same partition and therefore the
# same baseline scores further down; without a seed every run differs.
SPLIT_SEED = 42
energy_0_train, energy_0_test = train_test_split(
    energy_0, test_size=0.2, random_state=SPLIT_SEED
)
print(energy_0_train.head())

      building_id  meter           timestamp  meter_reading  hour
7479            0      0 2016-11-07 15:00:00        174.735    15
6843            0      0 2016-10-12 03:00:00        274.389     3
4992            0      0 2016-07-27 00:00:00        311.248     0
5444            0      0 2016-08-14 20:00:00        309.200    20
4115            0      0 2016-06-20 11:00:00        251.865    11


In [9]:
# Per-hour baselines computed from the training rows only: the mean and the
# median meter reading for each hour of the day.
energy_0_train_hours = energy_0_train.groupby('hour')
# Select the column *before* aggregating: only `meter_reading` is needed, so
# there is no reason to average every other column (timestamps included) and
# then throw the result away. Named aggregation yields the same
# 'Average' / 'Median' columns indexed by hour.
energy_0_train_averages = energy_0_train_hours['meter_reading'].agg(
    Average='mean', Median='median'
)
print(energy_0_train_averages)

         Average    Median
hour                      
0     238.911018  239.5790
1     241.060805  240.9440
2     235.922109  240.2610
3     237.228865  243.6740
4     239.553122  245.0390
5     240.168124  246.0630
6     237.033603  243.6740
7     238.986132  245.0390
8     243.761231  238.5550
9     233.997034  234.1180
10    232.980552  238.2140
11    237.947219  242.3090
12    235.682894  242.3090
13    236.645099  244.0155
14    234.883149  241.2850
15    238.326671  243.6740
16    237.138328  242.9910
17    233.269825  239.5790
18    237.233520  239.5790
19    238.693770  238.2140
20    235.623598  237.1895
21    235.149713  236.8480
22    237.320103  236.8480
23    239.732882  240.9440


In [10]:
def calculate_model(x):
    """Add squared log errors of three baseline predictions to ``x``.

    The baselines are the per-hour mean and median stored in the module-level
    ``energy_0_train_averages`` frame (columns ``'Average'`` and ``'Median'``,
    indexed by hour) and a constant zero prediction. Errors are measured on
    the ``log(1 + value)`` scale, i.e. they are the per-row terms of RMSLE.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        Either a single row (as passed by ``DataFrame.apply(..., axis=1)``)
        or a whole frame. Must provide ``meter_reading`` and ``hour``.

    Returns
    -------
    Same type as ``x``
        A copy of ``x`` with ``meter_reading_mean_q``,
        ``meter_reading_median_q`` and ``meter_reading_zero_q`` added.
        The input itself is left untouched.

    Raises
    ------
    KeyError
        In the single-row case, when the row's hour has no entry in
        ``energy_0_train_averages``. In the frame case such hours yield NaN.
    """
    averages = energy_0_train_averages
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by `apply`.
    result = x.copy()
    hour = x['hour']
    # log1p(v) == log(v + 1), but stays accurate for small v.
    reading_log = np.log1p(x['meter_reading'])

    if isinstance(x, pd.DataFrame):
        # Vectorised path: look up each row's hour in the baseline table.
        mean_log = np.log1p(hour.map(averages['Average']))
        median_log = np.log1p(hour.map(averages['Median']))
    else:
        # Row path: one label-based lookup instead of chained indexing.
        mean_log = np.log1p(averages.loc[hour, 'Average'])
        median_log = np.log1p(averages.loc[hour, 'Median'])

    result['meter_reading_mean_q'] = (reading_log - mean_log) ** 2
    result['meter_reading_median_q'] = (reading_log - median_log) ** 2
    # A zero prediction has log1p(0) == 0, so the error is the log reading itself.
    result['meter_reading_zero_q'] = reading_log ** 2
    return result

# Score every held-out row against the baselines; `result_type='expand'`
# turns the Series returned for each row back into DataFrame columns.
energy_0_test = energy_0_test.apply(
    func=calculate_model,
    axis=1,
    result_type='expand',
)
print(energy_0_test.head(n=5))

      building_id  meter           timestamp  meter_reading  hour  \
7251            0      0 2016-10-29 03:00:00        212.959     3   
4414            0      0 2016-07-02 22:00:00        250.500    22   
7741            0      0 2016-11-18 13:00:00        177.466    13   
4727            0      0 2016-07-15 23:00:00        236.848    23   
3616            0      0 2016-05-30 16:00:00        177.466    16   

      meter_reading_mean_q  meter_reading_median_q  meter_reading_zero_q  
7251              0.011545                0.017994             28.791642  
4414              0.002897                0.003115             30.552626  
7741              0.082014                0.100440             26.877984  
4727              0.000145                0.000292             29.938755  
3616              0.083205                0.097802             26.877984  


In [13]:
def _rmsle(squared_log_errors):
    """Return the square root of the sum of the errors divided by their count."""
    return np.sqrt(squared_log_errors.sum() / len(squared_log_errors))


# One score per baseline: hourly mean, hourly median and a constant zero.
energy_0_test_mean_rmsle = _rmsle(energy_0_test['meter_reading_mean_q'])
energy_0_test_median_rmsle = _rmsle(energy_0_test['meter_reading_median_q'])
energy_0_test_zero_rmsle = _rmsle(energy_0_test['meter_reading_zero_q'])

for label, score in (
    ('Average quality:', energy_0_test_mean_rmsle),
    ('Median quality:', energy_0_test_median_rmsle),
    ('Zero quality:', energy_0_test_zero_rmsle),
):
    print(label, score)

Average quality: 0.2440326816835114
Median quality: 0.2453967811241285
Zero quality: 5.461812550492672
