# TensorFlow: Mean model

Implement a mean model (a baseline that always predicts the mean value of the training labels) in TensorFlow.

In [80]:
import datetime
import calendar
import time
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Default size (width, height) for every matplotlib figure in this notebook
rcParams['figure.figsize'] = (12, 4)

# Prepare Input Data 

## Load and clean data

Load the rainfall and flow data from the CSV files and clean them by:
  * Resampling to 5-minute intervals
  * Slicing to the common time range
  * Filling NaNs

In [22]:
project_folder = '../../datasets/thorium-medium/'

# Flow: replace missing readings with 0, then put the series on a regular
# 5-minute grid, forward-filling the last known value into new slots.
# ffill() is the supported spelling of the deprecated Resampler.pad(), and
# '5min' is the non-deprecated alias of '5T'.
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
flow = flow.resample('5min').ffill()

# Rainfall: same cleaning steps as for the flow
rainfall = pd.read_csv(project_folder + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time')['rainfall'].fillna(0)
rainfall = rainfall.resample('5min').ffill()

# Align both series on the time index; dropna() keeps only the timestamps
# for which both flow and rainfall have a value
flow_rain = pd.concat([flow, rainfall], axis=1).dropna()
print(flow_rain.head())
print(flow_rain.tail())

                           flow  rainfall
time                                     
2015-06-01 14:15:00  115.559998       0.0
2015-06-01 14:20:00  115.199997       0.0
2015-06-01 14:25:00  112.209999       0.0
2015-06-01 14:30:00  112.860001       0.0
2015-06-01 14:35:00  113.349998       0.0
                           flow  rainfall
time                                     
2017-11-10 14:20:00  107.830002       0.0
2017-11-10 14:25:00  107.459999       0.0
2017-11-10 14:30:00  106.919998       0.0
2017-11-10 14:35:00  105.559998       0.0
2017-11-10 14:40:00  104.940002       0.0


## Process DateTime column

The DateTime index can be used to construct the feature vector. To work with dates in TensorFlow, we need to convert it into separate numeric fields:
  * year (listed for completeness; the code below derives only the next two fields)
  * day of year
  * minute of day

In [24]:
# Derive the calendar features straight from the DatetimeIndex with its
# vectorized accessors instead of calling a Python lambda for every row
flow_rain['day'] = flow_rain.index.dayofyear
flow_rain['minute'] = flow_rain.index.hour * 60 + flow_rain.index.minute
input_data = flow_rain[['day', 'minute', 'flow', 'rainfall']]
input_data.head()

Unnamed: 0_level_0,day,minute,flow,rainfall
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-06-01 14:15:00,152,855,115.559998,0.0
2015-06-01 14:20:00,152,860,115.199997,0.0
2015-06-01 14:25:00,152,865,112.209999,0.0
2015-06-01 14:30:00,152,870,112.860001,0.0
2015-06-01 14:35:00,152,875,113.349998,0.0


# Helper functions working on Pandas dataframe

In [45]:
def split_data(df_features, df_labels, split_day):
    """
    Split time-indexed features and labels into a training part and a test day.

    Params:
    df_features - time-indexed pandas object with features
    df_labels - time-indexed pandas object with labels
    split_day - pd.Timestamp marking the start of the test day

    Returns tuple (train_features, train_labels, test_features, test_labels).
    The training part ends one minute before split_day; the test part covers
    split_day up to and including split_day + 1439 minutes.
    """
    train_range = slice(None, split_day - pd.Timedelta(minutes=1))
    test_range = slice(split_day, split_day + pd.Timedelta(minutes=1439))
    return (df_features[train_range], df_labels[train_range],
            df_features[test_range], df_labels[test_range])


# Example split: everything before 2016-11-10 is training data, that day is the test set
example_split_day = pd.Timestamp('2016-11-10')
x_train, y_train, x_test, y_test = split_data(input_data[['minute', 'day']], input_data.flow, example_split_day)
# Show the boundaries of both parts to verify the split
for preview in (x_train.tail(), x_test.head(), x_test.tail()):
    print(preview)

                     minute  day
time                            
2016-11-09 23:35:00    1415  314
2016-11-09 23:40:00    1420  314
2016-11-09 23:45:00    1425  314
2016-11-09 23:50:00    1430  314
2016-11-09 23:55:00    1435  314
                     minute  day
time                            
2016-11-10 00:00:00       0  315
2016-11-10 00:05:00       5  315
2016-11-10 00:10:00      10  315
2016-11-10 00:15:00      15  315
2016-11-10 00:20:00      20  315
                     minute  day
time                            
2016-11-10 23:35:00    1415  315
2016-11-10 23:40:00    1420  315
2016-11-10 23:45:00    1425  315
2016-11-10 23:50:00    1430  315
2016-11-10 23:55:00    1435  315


# Create TensorFlow graph

## Error calculation

In [26]:
def calculate_error_op(y_hat, y):
    """
    Create operation for calculating the mean absolute percentage error,
    expressed in percent.
    See https://en.wikipedia.org/wiki/Mean_absolute_percentage_error

    Params:
    y_hat - Tensor with predicted values
    y - Tensor with actual values (used as the divisor)
    """
    relative_error = (y - y_hat) / y
    mean_abs_relative_error = tf.reduce_mean(tf.abs(relative_error))
    return 100 * mean_abs_relative_error

# Sanity check: (|1-1|/1 + |5-2|/5 + |10-3|/10) / 3 * 100, so the value should be ~43.3(3)%
predicted = tf.constant([1,2,3])
actual = tf.constant([1,5,10])
with tf.Session() as sess:
    op = calculate_error_op(predicted, actual)
    print(sess.run(op))

43.33333333333333


# Define TensorFlow graph

Define base prediction model

In [97]:
class MeanModel:
    """
    This model always predicts mean value.

    Graph operations are created on first request and cached on the instance,
    so repeated calls to train()/predict()/loss() return the same ops instead
    of adding new nodes to the TensorFlow graph on every call.
    """
    
    def __init__(self):
        """
        features - placeholder for a 2-D float32 batch of feature rows
        labels - placeholder for float32 labels
        model - tensor consists of mean value
        """
        self.features = tf.placeholder(tf.float32, shape=(None, None))
        # NOTE: (None) is plain None, not a 1-tuple, so the labels shape is unconstrained
        self.labels = tf.placeholder(tf.float32, shape=(None))
        self.model = tf.Variable([0.0])
        # Lazily built, cached operations (see train / predict / loss)
        self._train_op = None
        self._predict_op = None
        self._loss_op = None

    def train(self):
        """
        Create train operation.
        It stores the mean of the values fed into the labels placeholder in the model variable.
        """
        if self._train_op is None:
            mean = tf.reshape(tf.reduce_mean(self.labels), [1])
            self._train_op = self.model.assign(mean)
        return self._train_op
        
    def predict(self):
        """
        Create predict operation. It returns labels based on a given features:
        one copy of the model value per row fed into the features placeholder.
        """
        if self._predict_op is None:
            self._predict_op = tf.map_fn(lambda _: self.model, self.features)
        return self._predict_op
    
    def loss(self):
        """
        Calculate prediction loss (see calculate_error_op) between the
        predictions and the values fed into the labels placeholder.
        """
        if self._loss_op is None:
            # predict() yields one [mean] row per feature row. Flatten both
            # sides so the error is computed element-wise; without this a
            # column of predictions against a 1-D vector of labels would
            # broadcast to an n x n matrix.
            predictions = tf.reshape(self.predict(), [-1])
            labels = tf.reshape(self.labels, [-1])
            self._loss_op = calculate_error_op(predictions, labels)
        return self._loss_op

    
# Smoke test: the mean of the labels [1, 2, 3] is 2 and one prediction is
# returned per feature row, so the expected output is [[2.] [2.]]
model = MeanModel()
train_op = model.train()
predict_op = model.predict()
features = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
labels = np.array([1.0, 2.0, 3.0])
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    start_time = time.time()
    sess.run(train_op, feed_dict={model.features: features, model.labels: labels})
    predictions = sess.run(predict_op, feed_dict={model.features: features})
    print(predictions)
    print("Calculated in {:.3f} seconds".format(time.time() - start_time))

[[2.]
 [2.]]
Calculated in 0.908 seconds


In [91]:
def evaluate_day(sess, model, X, y, split_day):
    """
    Evaluate data for single day.

    Trains the model on everything before split_day and returns its loss on
    split_day itself.

    Params:
    sess - session used to run the operations
    model - object exposing train(), loss() and the features/labels placeholders
    X - time-indexed features
    y - time-indexed labels
    split_day - pd.Timestamp of the day to evaluate
    """
    # Only the train and loss ops are run here, so no separate predict op is
    # requested: building one would just add unused nodes to the graph.
    train_op = model.train()
    loss_op = model.loss()
    x_train, y_train, x_test, y_test = split_data(X, y, split_day)
    sess.run(train_op, feed_dict={model.features: x_train, model.labels: y_train})
    error = sess.run(loss_op, feed_dict={model.features: x_test, model.labels: y_test})
    return error


def evaluate_model(model, X, y, start_day):
    """
    Evaluate model on all days starting from the start_day.

    Params:
    model - model to evaluate; it must be constructed before this call so that
            its variables are covered by the initializer run here
    X - time-indexed features
    y - time-indexed labels
    start_day - pd.Timestamp of the first day to evaluate

    Returns tuple (95th percentile error as model score, list of daily errors)
    """
    # Take the end of the evaluation range from the labels that were passed in
    # rather than from a notebook-level variable, and stop one day before the
    # last sample.
    last_day = y.index[-1] - pd.Timedelta(1, 'D')
    costs = []
    # The context manager closes the session (and frees its resources) even
    # when an evaluation step raises.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        split_day = start_day
        while split_day < last_day:
            costs.append(evaluate_day(sess, model, X, y, split_day))
            split_day += pd.Timedelta(1, 'D')
    return np.percentile(costs, 95), costs

start_time = time.time()
# Build the model before the initializer runs, so that its variable already
# exists in the graph when global_variables_initializer() is created and run.
single_day_model = MeanModel()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    error = evaluate_day(sess, single_day_model, input_data[['day', 'minute']], input_data.flow, pd.Timestamp('2016-11-10'))
    print('Error: {:.2f}%'.format(error))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

Error: 98.95%
Calculated in 5.835 seconds


# Evaluate models for year 2017

Now that all the functions are ready, we evaluate the model for each day of 2017
and then report the 95th percentile of the daily errors as the model error.

In [85]:
# Score the mean model on every day from 2017-01-01 onwards and time the whole run
start_time = time.time()
evaluation_features = input_data[['day', 'minute']]
evaluation_labels = input_data.flow
first_evaluated_day = pd.Timestamp('2017-01-01')
score, costs = evaluate_model(MeanModel(), evaluation_features, evaluation_labels, first_evaluated_day)
print('MeanModel 95th percentile error: {:.2f}%'.format(score))
print("Calculated in {:.3f} seconds".format(time.time() - start_time))

MeanModel 95th percentile error: 22.68%
Calculated in 577.533 seconds
