# Prediction models for Project1

This notebook explores the following models:

  * MeanModel - Predicts mean value for all future values
  * LastDayModel - Predicts the same values as the previous day (given as features)

In [23]:
import datetime
import pprint
import json
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

## Load project

In [57]:
# Folder holding the project description and its flow measurements.
project_folder = '../datasets/project1/'
with open(project_folder + 'project.json', 'r') as file:
    project = json.load(file)
pprint.pprint(project)

print('Flow1')
# Build the CSV path from project_folder so the location is defined in one place only.
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
# Keep only the samples inside the project's [start-date, end-date) window.
flow = flow[(flow.time >= project['start-date']) & (flow.time < project['end-date'])]
# info() prints its summary itself and returns None, so it is not wrapped in print().
flow.info()
flow.head()

{'end-date': '2017-11-09',
 'flows': ['flow1'],
 'name': 'Project1',
 'rainfalls': [],
 'split-date': '2016-11-10',
 'start-date': '2013-09-10'}
Flow1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 437823 entries, 158 to 437980
Data columns (total 2 columns):
time    437823 non-null datetime64[ns]
flow    437823 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 10.0 MB
None


Unnamed: 0,time,flow
158,2013-09-10 00:00:00,0.0
159,2013-09-10 00:05:00,0.0
160,2013-09-10 00:10:00,0.0
161,2013-09-10 00:15:00,0.0
162,2013-09-10 00:20:00,0.0


## Create train and test dataset

Dataset consists of the following features:

  * Vector of last 24h data
  
and target value:

  * Vector of flow values for the next 24h

In [86]:
# Calendar day of every sample; used as the grouping key below.
# The .dt accessor replaces the per-element map(pd.Timestamp.date) call.
flow['day'] = flow.time.dt.date
# One row per day, holding that day's flow samples as a Python list.
df = flow.groupby('day')['flow'].apply(list)
print(df.head())
# Feature = samples of the previous row, target = samples of the current day.
# NOTE(review): shift() moves by one row, not by one calendar day. If a whole day
# is missing from the data, the feature would come from an earlier day - confirm
# that the series has no gaps.
dataset = pd.DataFrame({'flow_last_24h': df.shift(), 'target': df})
# Drop the first days of the series. The printed head shows 2013-09-10 and
# 2013-09-11 starting with zero flow - presumably that is why the dataset
# starts on 2013-09-13; TODO confirm.
dataset = dataset[datetime.date(2013, 9, 13):]
dataset.head()

day
2013-09-10    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2013-09-11    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2013-09-12    [82.87, 82.79, 81.71, 80.66, 79.71, 79.0, 77.7...
2013-09-13    [77.74, 75.79, 74.45, 73.56, 72.8, 70.86, 70.6...
2013-09-14    [78.26, 76.82, 76.2, 74.92, 74.39, 73.36, 72.5...
Name: flow, dtype: object


Unnamed: 0_level_0,flow_last_24h,target
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-09-13,"[82.87, 82.79, 81.71, 80.66, 79.71, 79.0, 77.7...","[77.74, 75.79, 74.45, 73.56, 72.8, 70.86, 70.6..."
2013-09-14,"[77.74, 75.79, 74.45, 73.56, 72.8, 70.86, 70.6...","[78.26, 76.82, 76.2, 74.92, 74.39, 73.36, 72.5..."
2013-09-15,"[78.26, 76.82, 76.2, 74.92, 74.39, 73.36, 72.5...","[78.25, 75.89, 74.8, 73.52, 73.93, 73.4, 73.64..."
2013-09-16,"[78.25, 75.89, 74.8, 73.52, 73.93, 73.4, 73.64...","[70.27, 69.54, 67.95, 66.94, 66.25, 65.56, 65...."
2013-09-17,"[70.27, 69.54, 67.95, 66.94, 66.25, 65.56, 65....","[75.92, 72.4, 70.43, 70.35, 70.81, 70.82, 69.7..."


# Helper functions

Helper functions for building training and test sets and calculating score

In [15]:
class PredictionModel:
    """
    Common fit/predict interface for the models in this notebook.

    Both methods are no-ops in this base class and return None;
    concrete models override them.
    """

    def fit(self, X, y):
        """Train the model on features X and targets y (does nothing here)."""
        return None

    def predict(self, X):
        """Return predictions for features X (returns None here)."""
        return None

    
def split(df, day):
    """
    Cut a frame with 'time' and 'flow' columns into a training and a test part.

    Rows with time strictly before `day` form the training part; rows with
    time in [day, day + 1 day) form the test part.

    Returns a tuple (X_train, Y_train, X_test, Y_test) where the X parts are
    one-column frames holding 'time' and the Y parts are the 'flow' series.
    """
    day_end = day + pd.Timedelta(days=1)

    is_history = df['time'] < day
    is_target_day = (df['time'] >= day) & (df['time'] < day_end)

    history = df[is_history]
    target_day = df[is_target_day]

    return history[['time']], history['flow'], target_day[['time']], target_day['flow']


def rmse(y_hat, y):
    """
    Root Mean Square Error between predictions `y_hat` and observed values `y`.
    """
    squared_error = mean_squared_error(y_hat, y)
    return np.sqrt(squared_error)


def evaluate_model(model, split_day, data=None):
    """
    Evaluate model on all days starting from split_day.

    For every day from split_day up to the last timestamp in the data, the
    model is re-fitted on all rows before that day and its RMSE is measured
    on the rows of that day. Days without any rows are skipped, so a gap in
    the data does not end the evaluation early.

    Parameters
    ----------
    model : object providing fit(X, y) and predict(X)
    split_day : pd.Timestamp
        First day to evaluate.
    data : pd.DataFrame, optional
        Frame handed to split(); it must have 'time' and 'flow' columns.
        When omitted, the notebook-level `dataset` variable is used.

    Returns
    -------
    99th percentile of the per-day RMSE values, used as the model score.

    Raises
    ------
    ValueError
        If no day from split_day onwards contains any rows.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        # NOTE(review): split() reads 'time' and 'flow' columns, while the
        # `dataset` frame built earlier in this notebook has 'flow_last_24h'
        # and 'target' - confirm which frame is meant to be evaluated.
        data = dataset

    last_time = data['time'].max()
    costs = []

    # Timestamps are immutable, so advancing `current_day` never affects the caller's value.
    current_day = split_day
    while current_day <= last_time:
        X_train, Y_train, X_test, Y_test = split(data, current_day)
        if len(X_test) > 0:
            model.fit(X_train, Y_train)
            costs.append(rmse(model.predict(X_test), Y_test))
        current_day += pd.Timedelta(1, 'D')

    if not costs:
        raise ValueError('No data to evaluate on or after {}'.format(split_day))
    return np.percentile(costs, 99)

# Models

## MeanModel

Calculate the mean of all data points in the training set. 
Ignore features and predict constant value (equal to this mean) for all predicted values

In [16]:
class MeanModel(PredictionModel):
    """
    Baseline that ignores the features and always predicts the training mean.
    """

    def __init__(self):
        # Value predicted before fit() has been called.
        self.mu = 0

    def fit(self, X, y):
        """Store the mean of the targets y; the features X are not used."""
        self.mu = np.mean(y)

    def predict(self, X):
        """Return one prediction per row of X, each equal to the stored mean."""
        n_rows = len(X)
        return np.ones(n_rows) * self.mu


# Score the constant-mean baseline, evaluating every day from 2016-11-11 onwards.
# NOTE(review): project.json lists 'split-date' as 2016-11-10, one day earlier
# than the date used here - confirm the offset is intended.
score = evaluate_model(
    model=MeanModel(),
    split_day=pd.Timestamp('2016-11-11'),
)
print('MeanModel score: {:.2f}'.format(score))

MeanModel score: 35.60



## Last day model

Here we will build 2 naive models. 

The first one always predicts 0. The second one predicts the mean value. 

We will evaluate which model is better