# Prediction based on daily pattern model

Compare the following models

  * Previous day model - Next day is the same like previous day
  * Daily Pattern model - Calculate daily pattern from historical data. Use it as next day prediction.

In [1]:
import datetime
import calendar
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

In [19]:
project_folder = '../../datasets/thorium-medium/'
with open(project_folder + 'project.json', 'r') as file:
    project = json.load(file)
print(json.dumps(project, indent=4))
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
flow = flow.resample('5T').pad()
rainfall = pd.read_csv(project_folder + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time')['rainfall'].fillna(0)
rainfall = rainfall.resample('5T').pad()
flow_rain = pd.concat([flow, rainfall], axis=1).dropna()
print(flow_rain.head())
print(flow_rain.tail())

{
    "start-date": "2015-06-02",
    "flows": [
        "flow1"
    ],
    "end-date": "2017-10-01",
    "rainfalls": [
        "rainfall1"
    ],
    "split-date": "2017-01-01",
    "name": "thorium-medium"
}
                           flow  rainfall
time                                     
2015-06-01 14:15:00  115.559998       0.0
2015-06-01 14:20:00  115.199997       0.0
2015-06-01 14:25:00  112.209999       0.0
2015-06-01 14:30:00  112.860001       0.0
2015-06-01 14:35:00  113.349998       0.0
                           flow  rainfall
time                                     
2017-11-10 14:20:00  107.830002       0.0
2017-11-10 14:25:00  107.459999       0.0
2017-11-10 14:30:00  106.919998       0.0
2017-11-10 14:35:00  105.559998       0.0
2017-11-10 14:40:00  104.940002       0.0


## Helper functions

Helper functions for building training and test sets and calculating score

In [13]:
class PredictionModel:
    
    def fit(self, data_points):
        pass
        
    def predict(self, prediction_day):
        pass

    
def mae(y_hat, y):
    """
    Calculate Mean Absolute Error 
    This metric is better here since series have quite big outliers
    """
    return np.sum(np.absolute(y_hat-y))/y.shape[0]


def split_data(split_day):
    """Get all data up to given day"""
    end_day = split_day - pd.Timedelta('1 min')
    return flow[:end_day]


def evaluate_day(model, split_day):
    """Evaluate data for single day"""
    xs = split_data(split_day)
    next_day = split_day + pd.Timedelta(1, 'D')
    y = flow[next_day: next_day+pd.Timedelta('1439 min')]
    model.fit(xs)
    y_hat = model.predict(next_day)
    return mae(y_hat, y)


def evaluate_model(model, start_day):
    """
    Evaluate model on all days starting from split_day.
    Returns 90th percentile error as model score
    """
    last_day = pd.Timestamp(project['end-date'])
    split_day = start_day
    costs = []
    while split_day < last_day:
        cost = evaluate_day(model, split_day)
        costs.append(cost)
        split_day += pd.Timedelta(1, 'D')
    return np.percentile(costs, 90), costs


split_data(pd.Timestamp('2016-11-10')).tail()

time
2016-11-09 23:35:00    105.589996
2016-11-09 23:40:00    105.540001
2016-11-09 23:45:00    104.260002
2016-11-09 23:50:00    100.989998
2016-11-09 23:55:00     99.059998
Freq: 5T, Name: flow, dtype: float64

# Models

## Previous Day Model

Uses values from last day

In [15]:
class LastDayModel(PredictionModel):
    
    def fit(self, xs):
        self.y = xs.values[-288:]
        
    def predict(self, day):
        return self.y

    
score, costs = evaluate_model(LastDayModel(), pd.Timestamp('2016-11-11'))
print('LastDayModel score: {:.2f}'.format(score))

LastDayModel score: 16.21


Model for single day. Easy case

In [16]:
evaluate_day(LastDayModel(), pd.Timestamp('2016-11-11'))

12.860659811231825

And when next day is kind of outlier

In [17]:
evaluate_day(LastDayModel(), pd.Timestamp('2017-05-01'))

25.201145966847736

## Daily Pattern model

Create pattern of daily usage based on historical data. Use this pattern to predict next values

(This can take up to 10 minutes to calculate)

In [18]:
class DailyPatternModel(PredictionModel):
    
    def fit(self, xs):
        df = flow.to_frame().reset_index()
        self.daily_pattern = df.groupby(by=[df.time.map(lambda x : (x.hour, x.minute))]).flow.mean().values
        
    def predict(self, day):
        return self.daily_pattern

    
score, costs = evaluate_model(DailyPatternModel(), pd.Timestamp('2016-11-11'))
print('DailyPatternModel score: {:.2f}'.format(score))

DailyPatternModel score: 15.95


### Daily Pattern Median Model
Calculate median value for each time. Use it as a prediction for the next day.

In [20]:
class DayMedianModel(PredictionModel):
    
    def fit(self, xs):
        df = flow.to_frame().reset_index()
        self.daily_pattern = df.groupby(by=[df.time.map(lambda x : (x.hour, x.minute))]).flow.median().values
        
    def predict(self, day):
        return self.daily_pattern

    
score, costs = evaluate_model(DayMedianModel(), pd.Timestamp('2016-11-11'))
print('DayModel score: {:.2f}'.format(score))

DayModel score: 17.47


## Daily pattern for working and non-working days

For same data the daily pattern is different for working and non-working days. Lets check if it improves the model here.

In [26]:
class WeeklyPatternModel(PredictionModel):
    
    def fit(self, xs):
        df = flow.to_frame().reset_index()
        df_working = df[df.time.dt.dayofweek < 5]
        df_weekend = df[df.time.dt.dayofweek > 4]
        self.daily_pattern_working = df.groupby(by=[df_working.time.map(lambda x : (x.hour, x.minute))]).flow.mean().values
        self.daily_pattern_weekend = df.groupby(by=[df_weekend.time.map(lambda x : (x.hour, x.minute))]).flow.mean().values
        
    def predict(self, day):
        if day.dayofweek < 5:
            return self.daily_pattern_working
        else:
            return self.daily_pattern_weekend

    
score, costs = evaluate_model(WeeklyPatternModel(), pd.Timestamp('2016-11-11'))
print('WeeklyPatternModel score: {:.2f}'.format(score))

WeeklyPatternModel score: 15.90
