# Prediction using rainfall data

The best model without rainfall data is Mean Daily Pattern Model.
In this notebook we try to improve this model by using precipitation information.

In [9]:
import datetime
import calendar
import json
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 4

# Load project

In [2]:
project_folder = '../../datasets/thorium-medium/'
with open(project_folder + 'project.json', 'r') as file:
    project = json.load(file)
print(json.dumps(project, indent=4))
flow = pd.read_csv(project_folder + 'flow1.csv', parse_dates=['time'])
flow = flow.set_index('time')['flow'].fillna(0)
flow = flow.resample('5T').pad()
rainfall = pd.read_csv(project_folder + 'rainfall1.csv', parse_dates=['time'])
rainfall = rainfall.set_index('time')['rainfall'].fillna(0)
rainfall = rainfall.resample('5T').pad()
flow_rain = pd.concat([flow, rainfall], axis=1).dropna()
print(flow_rain.head())
print(flow_rain.tail())

{
    "end-date": "2017-10-01",
    "start-date": "2015-06-02",
    "rainfalls": [
        "rainfall1"
    ],
    "name": "thorium-medium",
    "split-date": "2017-01-01",
    "flows": [
        "flow1"
    ]
}
                           flow  rainfall
time                                     
2015-06-01 14:15:00  115.559998       0.0
2015-06-01 14:20:00  115.199997       0.0
2015-06-01 14:25:00  112.209999       0.0
2015-06-01 14:30:00  112.860001       0.0
2015-06-01 14:35:00  113.349998       0.0
                           flow  rainfall
time                                     
2017-11-10 14:20:00  107.830002       0.0
2017-11-10 14:25:00  107.459999       0.0
2017-11-10 14:30:00  106.919998       0.0
2017-11-10 14:35:00  105.559998       0.0
2017-11-10 14:40:00  104.940002       0.0


## Helper functions

Helper functions for building training and test sets and calculating score

In [3]:
class PredictionModel:
    
    def fit(self, data_points):
        pass
        
    def predict(self, prediction_day):
        pass

    
def mae(y_hat, y):
    """
    Calculate Mean Absolute Error 
    This metric is better here since series have quite big outliers
    """
    return np.sum(np.absolute(y_hat-y))/y.shape[0]


def split_data(split_day):
    """Get all data up to given day"""
    end_day = split_day - pd.Timedelta('1 min')
    return flow[:end_day]


def evaluate_day(model, split_day):
    """Evaluate data for single day"""
    xs = split_data(split_day)
    next_day = split_day + pd.Timedelta(1, 'D')
    y = flow[next_day: next_day+pd.Timedelta('1439 min')]
    model.fit(xs)
    y_hat = model.predict(next_day)
    return mae(y_hat, y)


def evaluate_model(model, start_day):
    """
    Evaluate model on all days starting from split_day.
    Returns 90th percentile error as model score
    """
    last_day = pd.Timestamp(project['end-date'])
    split_day = start_day
    costs = []
    while split_day < last_day:
        cost = evaluate_day(model, split_day)
        costs.append(cost)
        split_day += pd.Timedelta(1, 'D')
    return np.percentile(costs, 90), costs


split_data(pd.Timestamp('2016-11-10')).tail()

time
2016-11-09 23:35:00    105.589996
2016-11-09 23:40:00    105.540001
2016-11-09 23:45:00    104.260002
2016-11-09 23:50:00    100.989998
2016-11-09 23:55:00     99.059998
Freq: 5T, Name: flow, dtype: float64

# Models

## Last hour model

Use daily pattern and add inflow based on regression calculated from 1h precipitation

In [71]:
MIN_PRECIPITATION = 4
flow_rain['precipitation_1h'] = flow_rain.rainfall.rolling(12).sum()
df = flow_rain[flow_rain.precipitation_1h < MIN_PRECIPITATION].reset_index()
daily_pattern = df.groupby(by=[df.time.map(lambda x : (x.hour, x.minute))]).flow.mean()

def inflow(row):
    pattern_idx = (row.name.hour, row.name.minute)
    pattern_value = daily_pattern[pattern_idx]
    return row.flow - pattern_value

flow_rain['inflow'] = flow_rain.apply(inflow, axis=1)

In [72]:
class InflowModel(PredictionModel):
    
    def fit(self, xs):
        self.daily_pattern = daily_pattern.values
        df = flow_rain[(flow_rain.precipitation_1h >= MIN_PRECIPITATION) & (flow_rain.inflow < 100)]
        self.clf = tree.DecisionTreeRegressor()
        self.clf = self.clf.fit(df[['precipitation_1h']].values, df.inflow.values)
        
    def predict(self, day):
        df = flow_rain[day: day + pd.Timedelta('1439 min')]
        df2 = (df.precipitation_1h >= MIN_PRECIPITATION) & (df.inflow < 100)
        return daily_pattern.values + self.clf.predict(df[['precipitation_1h']].values) * df2

    
score, costs = evaluate_model(InflowModel(), pd.Timestamp('2016-11-11'))
print('InflowModel score: {:.2f}'.format(score))

InflowModel score: 15.84


In [None]:
class DecisionTreeModel(PredictionModel):
    
    def fit(self, xs):
        self.daily_pattern = daily_pattern.values
#         df_inflow = df[df.precipitation_1h > 3]
        self.clf = tree.DecisionTreeRegressor()
        self.clf = self.clf.fit(df[['precipitation_1h']], df.inflow)
        
    def predict(self, day):
        df = flow_rain[day: day + pd.Timedelta('1439 min')]
        return daily_pattern.values + self.clf.predict(df[['precipitation_1h']].values)

    
score, costs = evaluate_model(InflowModel(), pd.Timestamp('2016-11-11'))
print('InflowModel score: {:.2f}'.format(score))