# Online prediction for Project1

In online mode, the model is learning as soon as new data arrives.
It means that when we want our prediction we don't need to provede feature vector, 
since all data was already processed by model.

Table of contents:

  * Load model and create training and test datasets


In [1]:
import datetime
import calendar
import pprint
import json
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

In [45]:
project_folder = '../../datasets/project1/'
with open(project_folder + 'project.json', 'r') as file:
    project = json.load(file)
pprint.pprint(project)
print('Flow1')
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
flow = flow.resample('5T').pad()
flow.head()

{'end-date': '2017-11-09',
 'flows': ['flow1'],
 'name': 'Project1',
 'rainfalls': [],
 'split-date': '2016-11-10',
 'start-date': '2013-09-10'}
Flow1


time
2013-09-09 10:50:00     2.805
2013-09-09 10:55:00     2.796
2013-09-09 11:00:00     2.791
2013-09-09 11:05:00    46.680
2013-09-09 11:10:00    48.030
Freq: 5T, Name: flow, dtype: float64

## Helper functions

Helper functions for building training and test sets and calculating score

In [46]:
class PredictionModel:
    
    def updare(self, data_points):
        pass
        
    def predict(self, prediction_day):
        pass

    
def mae(y_hat, y):
    """
    Calculate Mean Absolute Error 
    This metric is better here since serries have quite big outliers
    """
    return np.sum(np.absolute(y_hat-y))/y.shape[0]


def split_data(split_day):
    """Get all data up to given day"""
    return flow[:split_day]


def evaluate_model(model, start_day):
    """
    Evaluate model on all days starting from split_day.
    Returns 90th percentile error as model score
    """
    last_day = pd.Timestamp(project['end-date'])
    split_day = start_day
    costs = []
    while split_day < last_day:
        data = split_data(split_day)
        model.fit(data)
        next_day = split_day + pd.Timedelta(1, 'D')
        y = flow[next_day: next_day+pd.Timedelta('1439 min')]
        cost = mae(model.predict(next_day), y)
        costs.append(cost)
        split_day = next_day
    return np.percentile(costs, 90), costs


split_data(pd.Timestamp('2016-11-10')).tail()

time
2016-11-09 23:40:00    77.021540
2016-11-09 23:45:00    76.518930
2016-11-09 23:50:00    76.294914
2016-11-09 23:55:00    74.822395
2016-11-10 00:00:00    73.598236
Freq: 5T, Name: flow, dtype: float64

# Models

# ConstMeanModel

In [60]:
class ConstantMeanModel(PredictionModel):
    
    def __init__(self):
        self.mu = 0
    
    def fit(self, xs):
        self.mu = np.mean(xs)
        
    def predict(self, day):
        return np.ones(12*24) * self.mu    

    
score, costs = evaluate_model(ConstantMeanModel(), pd.Timestamp('2016-11-11'))
print('ConstantMeanModel score: {:.2f}'.format(score))

ConstantMeanModel score: 18.86


## LastDayModel

Uses values from last day

In [61]:
class LastDayModel(PredictionModel):
    
    def fit(self, xs):
        self.y = xs.values[-288:]
        
    def predict(self, day):
        return self.y

    
score, costs = evaluate_model(LastDayModel(), pd.Timestamp('2016-11-11'))
print('LastDayModel score: {:.2f}'.format(score))

LastDayModel score: 12.02


## Day model

Calculate value for each hour from all days in the training set. Use it as prediction model for the next day


In [77]:
class DayModel(PredictionModel):
    
    def fit(self, xs):
        df = flow.to_frame().reset_index()
        self.y = df.groupby(by=[df.time.map(lambda x : (x.hour, x.minute))]).mean().flow.values
        
    def predict(self, day):
        return self.y

    
score, costs = evaluate_model(DayModel(), pd.Timestamp('2016-11-11'))
print('DayModel score: {:.2f}'.format(score))

DayModel score: 9.61
