# TensorFlow implementation of prediction modeling

This notebook tries to reimplement the whole pipeline in TensorFlow.

In [1]:
import datetime
import calendar
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

Load rainfall and flow data from the files and clean them by:
  * Resampling to 5 minutes
  * Slicing to the common range
  * Filling NaNs

In [102]:
project_folder = '../../datasets/thorium-medium/'

# Flow: missing readings become 0, then the series is put on a 5-minute grid.
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
# ffill() is the supported forward-fill on a Resampler; pad() was only an
# alias for it and is no longer available in pandas 2.x. '5min' is the
# long-standing spelling of the 5-minute frequency ('T' is deprecated in
# newer pandas releases).
flow = flow.resample('5min').ffill()

# Rainfall gets exactly the same treatment as flow.
rainfall = pd.read_csv(project_folder + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time')['rainfall'].fillna(0)
rainfall = rainfall.resample('5min').ffill()

# Side-by-side concat aligns on timestamp; dropna() keeps only the rows
# where both series have a value.
flow_rain = pd.concat([flow, rainfall], axis=1).dropna().reset_index()
print(flow_rain.head())
print(flow_rain.tail())

                 time        flow  rainfall
0 2015-06-01 14:15:00  115.559998       0.0
1 2015-06-01 14:20:00  115.199997       0.0
2 2015-06-01 14:25:00  112.209999       0.0
3 2015-06-01 14:30:00  112.860001       0.0
4 2015-06-01 14:35:00  113.349998       0.0
                      time        flow  rainfall
257185 2017-11-10 14:20:00  107.830002       0.0
257186 2017-11-10 14:25:00  107.459999       0.0
257187 2017-11-10 14:30:00  106.919998       0.0
257188 2017-11-10 14:35:00  105.559998       0.0
257189 2017-11-10 14:40:00  104.940002       0.0


## Convert Data into separate parts

DateTime can be used to construct a feature vector, but to work with dates in TensorFlow we need to split it into separate fields:
  * year
  * day of year
  * minute of day

In [103]:
# Split the timestamp into numeric parts with the vectorised `.dt` accessor
# instead of calling a Python lambda once per row.
flow_rain['year'] = flow_rain.time.dt.year
flow_rain['day'] = flow_rain.time.dt.dayofyear
# Minutes elapsed since midnight.
flow_rain['minute'] = flow_rain.time.dt.hour * 60 + flow_rain.time.dt.minute
dataset = flow_rain[['year', 'day', 'minute', 'flow', 'rainfall']]
dataset.head()

Unnamed: 0,year,day,minute,flow,rainfall
0,2015,152,855,115.559998,0.0
1,2015,152,860,115.199997,0.0
2,2015,152,865,112.209999,0.0
3,2015,152,870,112.860001,0.0
4,2015,152,875,113.349998,0.0


# Feature Engineering

In this notebook we use the following features:
  * One-hot encoding of the minute of the day

In [87]:
def encode_time(times):
    """
    Placeholder for the minute-of-day one-hot encoder.

    The goal is to turn a 1D tensor of times into a 2D one-hot tensor whose
    categories are minutes counted from 00:00. That encoding is not
    implemented yet: the input is handed back unchanged.
    """
    encoded = times  # TODO: replace with the actual one-hot encoding
    return encoded

# Show the timestamps of the `flow` series currently held by the kernel.
print(flow.index)
# Wrap the first rows of `flow` (head() default) in a constant tensor; the
# trailing bare expression makes the notebook display the tensor's repr.
x = tf.constant(flow.head().values)
x

DatetimeIndex(['2015-06-02 00:00:00', '2015-06-02 00:05:00',
               '2015-06-02 00:10:00', '2015-06-02 00:15:00',
               '2015-06-02 00:20:00', '2015-06-02 00:25:00',
               '2015-06-02 00:30:00', '2015-06-02 00:35:00',
               '2015-06-02 00:40:00', '2015-06-02 00:45:00',
               ...
               '2017-11-09 23:10:00', '2017-11-09 23:15:00',
               '2017-11-09 23:20:00', '2017-11-09 23:25:00',
               '2017-11-09 23:30:00', '2017-11-09 23:35:00',
               '2017-11-09 23:40:00', '2017-11-09 23:45:00',
               '2017-11-09 23:50:00', '2017-11-09 23:55:00'],
              dtype='datetime64[ns]', name='time', length=256896, freq='5T')


<tf.Tensor 'Const_1088:0' shape=(5,) dtype=float64>

# Define TensorFlow graph

The graph will consist of the following operations:
 * loss function
 * training
 * evaluation

In [82]:
class PredictionModel:
    """Do-nothing model: learns nothing and always forecasts zeros."""

    def fit(self, flow, rain):
        """Accept the training data and ignore it; returns None."""
        return None

    def predict(self, day, rain):
        """Return a TensorFlow tensor of 288 zeros; `day` and `rain` are unused."""
        forecast = tf.zeros(shape=288)
        return forecast

    
def rmse(y_hat, y):
    """
    Build the op for the mean absolute percentage error (MAPE) of a forecast.

    Despite its name this is not a root-mean-square error; see
    https://en.wikipedia.org/wiki/Mean_absolute_percentage_error

    y_hat -- predicted values
    y     -- observed values, also used as the denominator
    Both inputs should be 1D and of the same size.

    Returns a TensorFlow graph operation; running it yields the score in
    percent.
    """
    relative_error = (y - y_hat) / y
    mean_abs_error = tf.reduce_mean(tf.abs(relative_error))
    return 100 * mean_abs_error


def split_data(flow, split_day):
    """Return `flow` up to and including one minute before `split_day`.

    The slice is label based, hence inclusive of its end point; moving the
    cut one minute back keeps the `split_day` timestamp itself out.
    """
    one_minute = pd.Timedelta('1 min')
    cutoff = split_day - one_minute
    return flow[:cutoff]


def evaluate_day(model, flow, rain, split_day):
    """Fit `model` on the history before `split_day` and score one day.

    model     -- object exposing fit(flow, rain) and predict(day, rain)
    flow      -- flow Series indexed by timestamp
    rain      -- rainfall data, handed to the model untouched
    split_day -- timestamp at which the training history ends

    Returns the value obtained by running the `rmse` op on the model's
    forecast and the observed flow of the scored day.

    Ops are built in a private graph per call so that repeated evaluation
    does not keep adding nodes to the process-wide default graph.
    """
    xs = split_data(flow, split_day)
    # NOTE(review): the scored day starts one day after `split_day`, while
    # the history stops before `split_day`; the day in between is used for
    # neither. Confirm this gap is intended.
    next_day = split_day + pd.Timedelta(1, 'D')
    # Label slice covering the 1440 minutes of the scored day.
    observed = flow[next_day: next_day+pd.Timedelta('1439 min')]
    # The graph must become the default before the session is created so
    # that the session is bound to it.
    with tf.Graph().as_default(), tf.Session() as sess:
        y = tf.cast(tf.constant(observed.values), tf.float32)
        model.fit(xs, rain)
        y_hat = model.predict(next_day, rain)
        error = sess.run(rmse(y_hat, y))
    return error


def evaluate_model(model, flow, rain, start_day):
    """
    Score `model` day by day, using `start_day` as the first split day.

    The split day advances in one-day steps for as long as it is earlier
    than one day before the last timestamp in `flow`.

    Returns a tuple: the 95th percentile of the daily errors (the model
    score) and the list of all daily errors.
    """
    one_day = pd.Timedelta(1, 'D')
    final_split = flow.index[-1] - one_day
    daily_errors = []
    current = start_day
    while current < final_split:
        daily_errors.append(evaluate_day(model, flow, rain, current))
        current = current + one_day
    return np.percentile(daily_errors, 95), daily_errors


# Smoke test: score the all-zero PredictionModel for a single split day.
error = evaluate_day(PredictionModel(), flow, rainfall, pd.Timestamp('2016-11-10'))
print('Error: {:.2f}%'.format(error))

Error: 100.00%


# Evaluate some models for year 2017

## Mean model

In [61]:
class MeanModel:
    """Forecasts a flat line at the average of the training flow."""

    def fit(self, flow, rain):
        """Store the mean of all values in `flow`; `rain` is ignored."""
        history = flow.values
        self.mean = np.mean(history)

    def predict(self, day, rain):
        """Return 288 copies of the stored mean; `day` and `rain` are unused."""
        flat_day = np.ones(288)
        return flat_day * self.mean

    
# Evaluate day by day with 2017-01-01 as the first split day and report the
# 95th percentile of the daily errors.
score, costs = evaluate_model(MeanModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('MeanModel 95th percentile error: {:.2f}%'.format(score))

MeanModel 95th percentile error: 22.69%


## Daily pattern model

In [22]:
class DailyPatternModel:
    """Forecasts each time-of-day slot with its historical average flow."""

    def fit(self, flow, rain):
        """Average `flow` per (hour, minute) slot of the day.

        flow -- Series with a DatetimeIndex
        rain -- unused

        The result is stored in `self.daily_pattern`: one value per distinct
        (hour, minute) present in `flow`, ordered by hour and then minute.
        """
        # Grouping on the index fields avoids calling a Python lambda per
        # row and does not depend on how the series or its index are named.
        slots = [flow.index.hour, flow.index.minute]
        self.daily_pattern = flow.groupby(slots).mean().values

    def predict(self, day, rain):
        """Return the stored daily pattern; `day` and `rain` are not used."""
        return self.daily_pattern
    
    
# Evaluate day by day with 2017-01-01 as the first split day and report the
# 95th percentile of the daily errors.
score, costs = evaluate_model(DailyPatternModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('DailyPatternModel 95th percentile error: {:.2f}%'.format(score))

DailyPatternModel 95th percentile error: 19.48%


## Last day model

In [24]:
class LastDayModel(PredictionModel):
    """Repeats the most recent 288 samples of the training flow as the forecast."""

    def fit(self, flow, rain):
        """Keep the trailing 288 values of `flow`; `rain` is ignored."""
        history = flow.values
        self.y = history[-288:]

    def predict(self, day, rain):
        """Return the values remembered by `fit`, whatever `day` and `rain` are."""
        return self.y
    
# Evaluate day by day with 2017-01-01 as the first split day and report the
# 95th percentile of the daily errors.
score, costs = evaluate_model(LastDayModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('LastDayModel 95th percentile error: {:.2f}%'.format(score))    

LastDayModel 95th percentile error: 17.86%


## Daily pattern for working and non working days

In [29]:
class WeeklyPatternModel(PredictionModel):
    """Time-of-day average flow, kept separately for Mon-Fri and Sat-Sun."""

    @staticmethod
    def _time_of_day_mean(series):
        """Average `series` per (hour, minute) slot, ordered by hour then minute."""
        slots = [series.index.hour, series.index.minute]
        return series.groupby(slots).mean().values

    def fit(self, flow, rain):
        """Learn one daily pattern for working days and one for weekends.

        flow -- Series with a DatetimeIndex
        rain -- unused

        Each subset is grouped on its own, so the result does not rely on
        index alignment to discard the rows of the other subset.
        """
        # dayofweek: Monday is 0, so 0-4 are working days and 5-6 the weekend.
        is_working = flow.index.dayofweek < 5
        self.daily_pattern_working = self._time_of_day_mean(flow[is_working])
        self.daily_pattern_weekend = self._time_of_day_mean(flow[~is_working])

    def predict(self, day, rain):
        """Return the pattern matching the weekday of `day`; `rain` is unused."""
        if day.dayofweek < 5:
            return self.daily_pattern_working
        return self.daily_pattern_weekend

    
# Evaluate day by day with 2017-01-01 as the first split day and report the
# 95th percentile of the daily errors.
score, costs = evaluate_model(WeeklyPatternModel(), flow, rainfall, pd.Timestamp('2017-01-01'))
print('WeeklyPatternModel 95th percentile error: {:.2f}%'.format(score))    

WeeklyPatternModel 95th percentile error: 19.67%
