# Metrics for prediction model based on 1h resolution

In [1]:
import datetime
import calendar
import time
import json
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Wide, short default figure size (width, height in inches) for all plots.
rcParams['figure.figsize'] = (12, 4)

# Load project

Load flow data from the file and clean it by:
  * Resampling to 1-hour means
  * Adding `hour` and `day` columns derived from the timestamp
  * Removing NaNs

In [57]:
PROJECT_FOLDER = '../../datasets/thorium-small/'

# Read the raw flow measurements and average them into 1-hour bins.
flow = pd.read_csv(PROJECT_FOLDER + 'flow1.csv', parse_dates=['time'])
data_frame = flow.set_index('time').resample('1H').mean()
# Derive the features from the DatetimeIndex with its vectorized accessors
# instead of calling a Python lambda once per row.
data_frame['hour'] = data_frame.index.hour
data_frame['day'] = data_frame.index.date
print(data_frame.head())
print(data_frame.tail())
print('Number of NaNs: {:}'.format(data_frame.flow.isna().sum()))
# Drop the rows whose hourly mean is NaN so downstream code sees only
# valid flow values.
data_frame = data_frame.dropna()

                           flow  hour         day
time                                             
2015-06-01 14:00:00  113.754443    14  2015-06-01
2015-06-01 15:00:00  113.013333    15  2015-06-01
2015-06-01 16:00:00  114.081666    16  2015-06-01
2015-06-01 17:00:00  116.459167    17  2015-06-01
2015-06-01 18:00:00  123.825000    18  2015-06-01
                           flow  hour         day
time                                             
2017-11-10 10:00:00  120.655001    10  2017-11-10
2017-11-10 11:00:00  115.167501    11  2017-11-10
2017-11-10 12:00:00  111.554167    12  2017-11-10
2017-11-10 13:00:00  109.408332    13  2017-11-10
2017-11-10 14:00:00  107.245555    14  2017-11-10
Number of NaNs: 138


## Prepare functions for calculating model score

The basic prediction model uses the daily flow pattern as its prediction. Models are scored with the mean absolute percentage error (MAPE) defined below.

In [40]:
def loss(y_hat, y):
    """
    Mean absolute percentage error (MAPE) of a prediction, in percent.

    https://en.wikipedia.org/wiki/Mean_absolute_percentage_error

    Parameters
    ----------
    y_hat : array-like
        Predicted values.
    y : array-like
        Observed values; must be broadcastable with ``y_hat``. Zeros in
        ``y`` lead to a division by zero (inf/nan in the result).

    Returns
    -------
    float
        ``100 * sum(|y - y_hat| / |y|) / len(y)``.
    """
    # Work on plain float arrays so values are paired by position. With two
    # pandas Series, subtraction would align on index labels instead and
    # mismatched indexes would silently turn into NaNs that the sum skips.
    # This also lets plain lists be passed in.
    observed = np.asarray(y, dtype=float)
    predicted = np.asarray(y_hat, dtype=float)
    relative_error = np.abs((observed - predicted) / observed)
    return 100.0 * np.sum(relative_error) / observed.shape[0]


# Sanity check: predicting 1 where the observation is 2 is a 50 % error
# on every sample, so the printed loss should be 50.0.
err = loss(y_hat=np.array([1, 1, 1]), y=np.array([2, 2, 2]))
print("loss 50.0 == {}".format(err))

loss 50.0 == 50.0


# Evaluate dataset


Use all data before 2017-01-01 as the training set and the later data as the test set.

In [58]:
# First day of the test period; every earlier day belongs to the training set.
TEST_START_DATE = datetime.date(2017, 1, 1)

# Build the two sets from one mask and its complement so they always
# partition data_frame, rather than repeating the date in two comparisons.
is_train = data_frame.day < TEST_START_DATE
train_data = data_frame[is_train]
test_data = data_frame[~is_train]
print(train_data.tail())
print(test_data.head())

# The hour of day is the only feature; the hourly mean flow is the target.
x_train = train_data[['hour']]
y_train = train_data['flow']
x_test = test_data[['hour']]
y_test = test_data['flow']

                           flow  hour         day
time                                             
2016-12-31 19:00:00  126.925001    19  2016-12-31
2016-12-31 20:00:00  120.966667    20  2016-12-31
2016-12-31 21:00:00  114.404166    21  2016-12-31
2016-12-31 22:00:00  107.116667    22  2016-12-31
2016-12-31 23:00:00   99.808334    23  2016-12-31
                          flow  hour         day
time                                            
2017-01-01 00:00:00  94.330833     0  2017-01-01
2017-01-01 01:00:00  91.927501     1  2017-01-01
2017-01-01 02:00:00  90.722501     2  2017-01-01
2017-01-01 03:00:00  87.402501     3  2017-01-01
2017-01-01 04:00:00  81.485833     4  2017-01-01


# Test mean model

In [59]:
# Baseline: predict the training-set mean flow for every test sample.
mu = y_train.mean()
y_hat = np.full(len(y_test), mu)
loss(y_hat, y_test)

17.7263878591571

# Test TreeRegression

In [60]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree with default settings on the hour-of-day feature.
# fit() returns the estimator itself, so the cell still displays its repr.
regressor = DecisionTreeRegressor().fit(x_train, y_train)
regressor

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [61]:
# Score the fitted tree on the held-out test set with the same loss
# that was used for the mean baseline.
y_hat = regressor.predict(X=x_test)
loss(y_hat=y_hat, y=y_test)

8.8283833611034872