# [WIP] Prediction model based on recurrention

This model internally always predicts only next value. The prediction for the next 24h hours is achived by 
applying output from the first prediction as an input to the next prediction (together with other features)

In [1]:
import datetime
import time
import calendar
import json
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

In [2]:
project_folder = '../../datasets/thorium-medium/'
with open(project_folder + 'project.json', 'r') as file:
    project = json.load(file)
print(json.dumps(project, indent=4))
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
flow = flow.resample('5T').pad()
rainfall = pd.read_csv(project_folder + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time')['rainfall'].fillna(0)
rainfall = rainfall.resample('5T').pad()
flow_rain = pd.concat([flow, rainfall], axis=1).dropna()
flow_rain = flow_rain['2015-06-02':'2017-11-09']
print(flow_rain.head())
print(flow_rain.tail())
flow = flow_rain['flow']
rainfall = flow_rain['rainfall']

{
    "rainfalls": [
        "rainfall1"
    ],
    "split-date": "2017-01-01",
    "end-date": "2017-10-01",
    "start-date": "2015-06-02",
    "flows": [
        "flow1"
    ],
    "name": "thorium-medium"
}
                           flow  rainfall
time                                     
2015-06-02 00:00:00  100.889999       0.0
2015-06-02 00:05:00   99.839996       0.0
2015-06-02 00:10:00   99.279999       0.0
2015-06-02 00:15:00   98.139999       0.0
2015-06-02 00:20:00   97.110001       0.0
                           flow  rainfall
time                                     
2017-11-09 23:35:00  111.739998       0.0
2017-11-09 23:40:00  110.949997       0.0
2017-11-09 23:45:00  110.519997       0.0
2017-11-09 23:50:00  109.190002       0.0
2017-11-09 23:55:00  107.089996       0.0


## Helper functions

Helper functions for building training and test sets and calculating score

In [3]:
class LastDayModel():
    """
    Fast base model as a reference point
    Expected error: 17.86%
    """
    
    def fit(self, flow, rain):
        self.y = flow.values[-288:]
        
    def predict(self, day, rain, last_flow):
        return self.y
    
    
def loss(y_hat, y):
    """
    https://en.wikipedia.org/wiki/Mean_absolute_percentage_error
    """
    return 100.0 * np.sum(np.abs((y-y_hat) / y)) / y.shape[0]


def split_data(flow, split_day):
    """Get all data up to given day"""
    end_day = split_day - pd.Timedelta('1 min')
    return flow[:end_day]


def evaluate_day(model, flow, rain, split_day):
    """Evaluate data for single day"""
    xs = split_data(flow, split_day)
    next_day = split_day + pd.Timedelta(1, 'D')
    y = flow[next_day: next_day+pd.Timedelta('1439 min')]
    model.fit(xs, rain)
    y_hat = model.predict(next_day, rain, xs[-1])
    return loss(y_hat, y)


def evaluate_model(model, flow, rain, start_day):
    """
    Evaluate model on all days starting from split_day.
    Returns 90th percentile error as model score
    """
    last_day = flow.index[-1] - pd.Timedelta(1, 'D')
    split_day = start_day
    costs = []
    while split_day < last_day:
        cost = evaluate_day(model, flow, rain, split_day)
        costs.append(cost)
        split_day += pd.Timedelta(1, 'D')
    return np.percentile(costs, 95), costs

score, costs = evaluate_model(LastDayModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('LastDayModel score: {:.2f}% (expected: 17.86%)'.format(score))

LastDayModel score: 17.86% (expected: 17.86%)


# Extract features

Extract the following features:

  * Minutes of the day
  * Previous flow value
  * Total precipitation in the last N hours

In [4]:
def encode_time(xs):
    """
    Encode time as Int value
    Params:
      times - pd.Series with Timestamp
    Return:
      pd.Series with encoded time
    """
    return xs.map(lambda x: x.hour*60+x.minute)


def encode_features(flow, rain, last_rain_window = 1):
    """
    Create feature vector based on 
    Return feature and expected values
    """
    df = pd.concat([flow, rain], axis=1).reset_index()
    df['last_rain'] = df.rainfall.rolling(last_rain_window*12).sum()
    df['minutes'] = encode_time(df.time)
    df['last_flow'] = df.flow.shift()
    df = df.dropna()
    return (df[['minutes', 'last_flow', 'last_rain']], df.flow)


def prepare_prediction_features(day, rain, last_rain_window = 1):
    last_rain = rain.rolling(last_rain_window*12).sum()
    ts = pd.date_range(day, periods=288, freq='5T')
    minutes = pd.Series(data=encode_time(ts), index=ts)
    df = pd.concat([minutes, last_rain], axis=1).dropna()
    return df

X, y = encode_features(flow[:'2016-12-31'], rainfall[:'2016-12-31'])
print(X.tail())
print(y.tail())

        minutes  last_flow     last_rain
166747     1415  99.080002 -1.865175e-14
166748     1420  98.650002 -1.865175e-14
166749     1425  98.199997 -1.865175e-14
166750     1430  97.650002 -1.865175e-14
166751     1435  97.019997 -1.865175e-14
166747    98.650002
166748    98.199997
166749    97.650002
166750    97.019997
166751    96.860001
Name: flow, dtype: float64


# Models

## Linear regression

As a baseline lets try Linear Model

In [17]:
from sklearn.linear_model import LinearRegression

class LinearModel:
    
    def __init__(self, rain_window_size=1):
        self.rain_window_size = rain_window_size
        self.clf = LinearRegression()
    
    def fit(self, flow, rain):
        X, y = encode_features(flow, rain)
        self.clf.fit(X.values, y.values)
        
    def predict(self, day, rain, last_flow):
        base_features = prepare_prediction_features(day, rain).values
        predictions = []
        flow = last_flow
        for row in base_features:
            feature = np.array([[row[0], flow, row[1]]])
            pred = self.clf.predict(feature)[0]
            predictions.append(pred)
            flow = pred
        return np.array(predictions)

start_time = time.time()
model = LinearModel()
score, costs = evaluate_model(model, flow_rain.flow, flow_rain.rainfall, pd.Timestamp('2017-01-01'))
print('LinearModel score: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))
print('Model coef: {}, intercept: {}'.format(model.clf.coef_, model.clf.intercept_))

LinearModel score: 21.71%
Calculated in 304.371 seconds
Model coef: [ -9.37087226e-05   9.96541233e-01   7.65309783e-02], intercept: 0.41214071862701473


## Decision Tree Regressor

First non linear model. Should improve on linear model

In [10]:
from sklearn import tree

class DTModel(LinearModel):
    
    def __init__(self):
        self.clf = tree.DecisionTreeRegressor()


start_time = time.time()
model = DTModel()
score, costs = evaluate_model(model, flow_rain.flow, flow_rain.rainfall, pd.Timestamp('2017-01-01'))
print('DTModel 2h score: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))
model.clf.feature_importances_

DTModel 2h score: 18.10%
Calculated in 567.181 seconds


## XGBoost

In [27]:
import xgboost as xg

class XGBoostModel(LinearModel):
    
    def __init__(self, rain_window_size=2):
        self.rain_window_size = rain_window_size
        self.clf = xg.XGBRegressor()
    

start_time = time.time()
score, costs = evaluate_model(XGBoostModel(2), flow_rain.flow, flow_rain.rainfall, pd.Timestamp('2017-01-01'))
print('XGBoostModel 2h score: {:.2f}'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

XGBoostModel 2h score: 17.36
Calculated in 871.612 seconds
