# Metrics for prediction model

This notebook contains metric definition for Prediction model.
Model score is calculated using the following formula:
  * For each day in year 2017
    * Build model based on data before given day
    * Predict given day
    * Calculate prediction error for given day
  * Report 95th percentile as model score

In [2]:
import datetime
import calendar
import time
import json
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

Load rainfall and flow data from the files and clean it by:
  * Resampling to 5 minutes
  * Slice to the common range
  * Fill NaNs

In [3]:
project_folder = '../../datasets/thorium-medium/'
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
flow = flow.resample('5T').pad()
rainfall = pd.read_csv(project_folder + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time')['rainfall'].fillna(0)
rainfall = rainfall.resample('5T').pad()
flow_rain = pd.concat([flow, rainfall], axis=1).dropna()
print(flow_rain.head())
print(flow_rain.tail())
flow_rain = flow_rain['2015-06-02':'2017-11-09']
flow = flow_rain['flow']
rainfall = flow_rain['rainfall']

                           flow  rainfall
time                                     
2015-06-01 14:15:00  115.559998       0.0
2015-06-01 14:20:00  115.199997       0.0
2015-06-01 14:25:00  112.209999       0.0
2015-06-01 14:30:00  112.860001       0.0
2015-06-01 14:35:00  113.349998       0.0
                           flow  rainfall
time                                     
2017-11-10 14:20:00  107.830002       0.0
2017-11-10 14:25:00  107.459999       0.0
2017-11-10 14:30:00  106.919998       0.0
2017-11-10 14:35:00  105.559998       0.0
2017-11-10 14:40:00  104.940002       0.0


## Prepare functions for calculating model score

The basic prediction model uses daily pattern as a prediction

In [4]:
class PredictionModel:

    def fit(self, flow, rain):
        pass
        
    def predict(self, day, rain):
        return np.zeros(288)

    
def loss(y_hat, y):
    """
    https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
    """
    return 100.0 * np.sum(np.abs((y-y_hat) / y)) / y.shape[0]


def split_data(flow, split_day):
    """Get all data up to given day"""
    end_day = split_day - pd.Timedelta('1 min')
    return flow[:end_day]


def evaluate_day(model, flow, rain, split_day):
    """Evaluate data for single day"""
    xs = split_data(flow, split_day)
    next_day = split_day + pd.Timedelta(1, 'D')
    y = flow[next_day: next_day+pd.Timedelta('1439 min')]
    model.fit(xs, rain)
    y_hat = model.predict(next_day, rain)
    return loss(y_hat, y)


def evaluate_model(model, flow, rain, start_day):
    """
    Evaluate model on all days starting from the split_day.
    Returns 95th percentile error as model score
    """
    last_day = flow.index[-1] - pd.Timedelta(1, 'D')
    split_day = start_day
    costs = []
    while split_day < last_day:
        cost = evaluate_day(model, flow, rain, split_day)
        costs.append(cost)
        split_day += pd.Timedelta(1, 'D')
    return np.percentile(costs, 95), costs


error = evaluate_day(PredictionModel(), flow, rainfall, pd.Timestamp('2016-11-10'))
print('Error: {:.2f}%'.format(error))

Error: 100.00%


# Evaluate some models for year 2017

## Mean model

In [5]:
class MeanModel:

    def fit(self, flow, rain):
        self.mean = np.mean(flow.values)
        
    def predict(self, day, rain):
        return np.ones(288) * self.mean

    
start_time = time.time()
score, costs = evaluate_model(MeanModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('MeanModel 95th percentile error: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

MeanModel 95th percentile error: 22.69%
Calculated in 0.232 seconds


## Daily pattern model

In [5]:
class DailyPatternModel:
    
    def fit(self, flow, rain):
        """ Use daily pattern """
        df = flow.to_frame().reset_index()
        self.daily_pattern = df.groupby(by=[df.time.map(lambda x : (x.hour, x.minute))]).flow.mean().values
        
    def predict(self, day, rain):
        return self.daily_pattern
    
    
start_time = time.time()    
score, costs = evaluate_model(DailyPatternModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('DailyPatternModel 95th percentile error: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

DailyPatternModel 95th percentile error: 19.48%
Calculated in 234.036 seconds


## Last day model

In [6]:
class LastDayModel(PredictionModel):
    
    def fit(self, flow, rain):
        self.y = flow.values[-288:]
        
    def predict(self, day, rain):
        return self.y
    
score, costs = evaluate_model(LastDayModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('LastDayModel 95th percentile error: {:.2f}%'.format(score))    

LastDayModel 95th percentile error: 17.86%


## Daily pattern for working and non working days

In [7]:
class WeeklyPatternModel(PredictionModel):
    
    def fit(self, flow, rain):
        df = flow.to_frame().reset_index()
        df_working = df[df.time.dt.dayofweek < 5]
        df_weekend = df[df.time.dt.dayofweek > 4]
        self.daily_pattern_working = df.groupby(by=[df_working.time.map(lambda x : (x.hour, x.minute))]).flow.mean().values
        self.daily_pattern_weekend = df.groupby(by=[df_weekend.time.map(lambda x : (x.hour, x.minute))]).flow.mean().values
        
    def predict(self, day, rain):
        if day.dayofweek < 5:
            return self.daily_pattern_working
        else:
            return self.daily_pattern_weekend

    
score, costs = evaluate_model(WeeklyPatternModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('WeeklyPatternModel 95th percentile error: {:.2f}%'.format(score))    

WeeklyPatternModel 95th percentile error: 19.67%
