# Metrics for prediction model based on 1h resolution

This notebook contains metric definition for Prediction model.
Model score is calculated using the following formula:
  * For each day in year 2017
    * Build model based on data before given day
    * Predict given day
    * Calculate prediction error for given day
  * Report 95th percentile as model score

In [1]:
import datetime
import calendar
import time
import json
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

Load rainfall and flow data from the files and clean it by:
  * Resampling to 5 minutes
  * Slice to the common range
  * Fill NaNs

In [2]:
PROJECT_FOLDER = '../../datasets/radon-medium/'

flow = pd.read_csv(PROJECT_FOLDER + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time').flow
flow = flow.resample('1H').mean()

rainfall = pd.read_csv(PROJECT_FOLDER + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time').rainfall
rainfall = rainfall.resample('1H').mean()

data_frame = pd.concat([flow, rainfall], axis=1).fillna(0)
data_frame['day'] = data_frame.index.map(lambda x: x.date())
data_frame = data_frame['2015-01-01':'2018-07-05']
print(data_frame.isna().sum())
print(data_frame.head())
print(data_frame.tail())

flow        0
rainfall    0
day         0
dtype: int64
                          flow  rainfall         day
time                                                
2015-01-01 00:00:00  76.796188       0.0  2015-01-01
2015-01-01 01:00:00  71.892892       0.0  2015-01-01
2015-01-01 02:00:00  63.906876       0.0  2015-01-01
2015-01-01 03:00:00  60.286973       0.0  2015-01-01
2015-01-01 04:00:00  57.049687       0.0  2015-01-01
                          flow  rainfall         day
time                                                
2018-07-05 19:00:00  96.729522       0.0  2018-07-05
2018-07-05 20:00:00  99.925573       0.0  2018-07-05
2018-07-05 21:00:00  98.718231       0.0  2018-07-05
2018-07-05 22:00:00  87.898124       0.0  2018-07-05
2018-07-05 23:00:00  78.218643       0.0  2018-07-05


## Prepare functions for calculating model score

The basic prediction model uses daily pattern as a prediction

In [3]:
class PredictionModel:

    def fit(self, flow, rain):
        pass
        
    def predict(self, day, rain):
        return np.zeros(24)

    
def loss(y_hat, y):
    """
    https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
    """
    return 100.0 * np.sum(np.abs((y-y_hat) / y)) / y.shape[0]


def split_data(flow, split_day):
    """Get all data up to given day"""
    end_day = split_day - pd.Timedelta('1 min')
    return flow[:end_day]


def evaluate_day(model, flow, rain, day):
    """Evaluate data for single day"""
    xs = split_data(flow, day)
    y = flow[day: day+pd.Timedelta('1439 min')]
    model.fit(xs, rain)
    y_hat = model.predict(day, rain)
    return loss(y_hat, y)


def evaluate_model(model, flow, rain, start_day):
    """
    Evaluate model on all days starting from the split_day.
    Returns 95th percentile error as model score
    """
    last_day = flow.index[-1] - pd.Timedelta(1, 'D')
    split_day = start_day
    costs = []
    while split_day < last_day:
        cost = evaluate_day(model, flow, rain, split_day)
        costs.append(cost)
        split_day += pd.Timedelta(1, 'D')
    return np.percentile(costs, 95), costs


error = evaluate_day(PredictionModel(), flow, rainfall, pd.Timestamp('2017-11-10'))
print('Error: {:.2f}%'.format(error))

Error: 100.00%


# Evaluate some models for year 2018

## Mean model

In [4]:
class MeanModel:

    def fit(self, flow, rain):
        self.mean = np.mean(flow.values)
        
    def predict(self, day, rain):
        return np.ones(24) * self.mean

    
start_time = time.time()
score, costs = evaluate_model(MeanModel(), data_frame.flow, data_frame.rainfall, pd.Timestamp('2018-01-01'))
print('MeanModel 95th percentile error: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

MeanModel 95th percentile error: inf%
Calculated in 0.182 seconds


## Daily pattern model

In [5]:
class DailyPatternModel:
    
    def fit(self, flow, rain):
        """ Use daily pattern """
        df = flow.to_frame().reset_index()
        self.daily_pattern = df.groupby(by=[df.time.map(lambda x : x.hour)]).flow.mean().values
        
    def predict(self, day, rain):
        return self.daily_pattern
    
    
start_time = time.time()    
score, costs = evaluate_model(DailyPatternModel(), data_frame.flow, data_frame.rainfall, pd.Timestamp('2017-01-01'))
print('DailyPatternModel 95th percentile error: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

DailyPatternModel 95th percentile error: 15.85%
Calculated in 23.864 seconds


## Last day model

In [6]:
class LastDayModel(PredictionModel):
    
    def fit(self, flow, rain):
        self.y = flow.values[-24:]
        
    def predict(self, day, rain):
        return self.y
    
score, costs = evaluate_model(LastDayModel(), data_frame.flow, data_frame.rainfall, pd.Timestamp('2017-01-01'))
print('LastDayModel 95th percentile error: {:.2f}%'.format(score))    

LastDayModel 95th percentile error: 16.57%
