### Basics Libraries and Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Loading data

In [2]:
features = ["date","hour","flow","anomaly"]
df = pd.read_csv('barreiro_ano.csv', sep=';', names=features)
holidays = pd.read_csv('holidays2018.csv',sep=';')

### Basic dataset changes

In [3]:
# drops the anomaly column
df.drop(['anomaly'], axis='columns', inplace=True)

In [4]:
# converts hour in indexes
indexes={'00:07:30':0, '00:22:30':1, '00:37:30':2, '00:52:30':3,
         '01:07:30':4, '01:22:30':5, '01:37:30':6, '01:52:30':7,
         '02:07:30':8, '02:22:30':9, '02:37:30':10, '02:52:30':11,
         '03:07:30':12, '03:22:30':13, '03:37:30':14, '03:52:30':15,
         '04:07:30':16, '04:22:30':17, '04:37:30':18, '04:52:30':19,
         '05:07:30':20, '05:22:30':21, '05:37:30':22, '05:52:30':23,
         '06:07:30':24, '06:22:30':25, '06:37:30':26, '06:52:30':27,
         '07:07:30':28, '07:22:30':29, '07:37:30':30, '07:52:30':32,
         '08:07:30':32, '08:22:30':33, '08:37:30':34, '08:52:30':35,
         '09:07:30':36, '09:22:30':37, '09:37:30':38, '09:52:30':39,
         '10:07:30':40, '10:22:30':41, '10:37:30':42, '10:52:30':43,
         '11:07:30':44, '11:22:30':45, '11:37:30':46, '11:52:30':47,
         '12:07:30':48, '12:22:30':49, '12:37:30':50, '12:52:30':51,
         '13:07:30':52, '13:22:30':53, '13:37:30':54, '13:52:30':55,
         '14:07:30':56, '14:22:30':57, '14:37:30':58, '14:52:30':59,
         '15:07:30':60, '15:22:30':61, '15:37:30':62, '15:52:30':63,
         '16:07:30':64, '16:22:30':65, '16:37:30':66, '16:52:30':67,
         '17:07:30':68, '17:22:30':69, '17:37:30':70, '17:52:30':71,
         '18:07:30':72, '18:22:30':73, '18:37:30':74, '18:52:30':75,
         '19:07:30':76, '19:22:30':77, '19:37:30':78, '19:52:30':79,
         '20:07:30':80, '20:22:30':81, '20:37:30':82, '20:52:30':83,
         '21:07:30':84, '21:22:30':85, '21:37:30':86, '21:52:30':87,
         '22:07:30':88, '22:22:30':89, '22:37:30':90, '22:52:30':91,
         '23:07:30':92, '23:22:30':93, '23:37:30':94, '23:52:30':95}

# in order to apply the indexes
df['hour'] = [indexes[item] for item in df['hour']]

In [5]:
# pass the date columns to the index
df['date'] = pd.to_datetime(df['date'], dayfirst=True).dt.strftime('%Y-%m-%d').astype(str)
df['date'] = pd.DatetimeIndex(data=df['date'], dtype='datetime64[ns]', name='date', freq=None)
df = df.set_index('date')
# due to errors, we passed the date as an index of the dataset

In [6]:
pd.set_option("display.max_rows", 200)
df.head(200)

Unnamed: 0_level_0,hour,flow
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,0,18.333067
2018-01-01,1,18.333067
2018-01-01,2,19.784872
2018-01-01,3,22.294744
2018-01-01,4,27.229756
2018-01-01,5,27.615883
2018-01-01,6,26.663889
2018-01-01,7,24.244111
2018-01-01,8,22.159583
2018-01-01,9,18.577917


## Lags of time series

### 10 previous readings

In [7]:
# in order to construct a new dataframe with D lagged values
df_lagged = df.copy()
number_lags = 10 # this is D

for i in range(1, number_lags+1):
    shifted = df['flow'].shift(i)
    df_lagged = pd.concat((df_lagged, shifted), axis=1)

In [8]:
# just to label the columns of the resulting dataframe
lagged_cols=["n-"+ str(x) for x in range(1,number_lags+1)]
colnames=["hour","n"]+lagged_cols
df_lagged.columns=colnames
df_lagged.head()

Unnamed: 0_level_0,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-01,0,18.333067,,,,,,,,,,
2018-01-01,1,18.333067,18.333067,,,,,,,,,
2018-01-01,2,19.784872,18.333067,18.333067,,,,,,,,
2018-01-01,3,22.294744,19.784872,18.333067,18.333067,,,,,,,
2018-01-01,4,27.229756,22.294744,19.784872,18.333067,18.333067,,,,,,


### - first day

In [9]:
# in order to drop the first 96 rows
df_lagged = df_lagged.iloc[96:, :]

### Selects data

In [10]:
# to extract only the lagged values for a specific time of the day
# '09:07:30'
X=df_lagged[df_lagged['hour']==36]
X

Unnamed: 0_level_0,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-01-02,36,27.891244,27.032378,25.853372,26.663911,26.901883,22.825889,23.315750,26.222811,18.563944,23.993039,20.549372
2018-01-03,36,29.384411,28.303822,28.019183,32.619433,35.274683,30.146444,35.819239,39.155139,30.970794,27.885411,23.554544
2018-01-04,36,26.402150,30.625439,32.003411,32.124433,32.319306,35.177678,39.074994,39.210860,31.088556,29.551022,23.684256
2018-01-05,36,23.162161,25.266222,25.069714,28.147304,32.233878,30.782600,37.183911,36.682972,27.989978,31.486211,23.892372
2018-01-06,36,32.252917,29.763883,24.828594,24.983883,20.047044,19.578261,20.016311,20.419050,14.961911,20.902456,17.469717
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-27,36,28.926228,25.840367,25.091561,26.449022,26.366422,22.880300,21.540528,21.510583,18.446900,15.844733,14.171133
2018-12-28,36,29.847183,31.235122,28.610106,25.545911,27.475039,21.417922,22.062256,18.795039,18.326167,16.806472,12.424922
2018-12-29,36,28.895756,26.590378,23.702844,24.343911,20.284433,17.317983,14.572317,12.865272,10.646539,10.688817,9.710161
2018-12-30,36,28.315661,24.239617,23.822600,19.399444,16.524283,15.482278,13.567911,11.677278,11.355806,10.127611,8.993256


### Train test split

In [11]:
# X is the data to predict 9h
y = X['n'].copy()
X = X.drop(['n'], axis = 1)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4)

In [13]:
# TIME SERIES SPLIT
# Provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets.
# In each split, test indices must be higher than before, and thus shuffling in cross validator is inappropriate.
# This cross-validation object is a variation of KFold. In the kth split, it returns first k folds as train set and the (k+1)th fold as test set.
#import numpy as np
#from sklearn.model_selection import TimeSeriesSplit
#tscv = TimeSeriesSplit(n_splits=1, test_size=2)
#tscv
#for train, test in tscv.split(X):
#    print("TRAIN:", train, "TEST:", test)
#    X_train, X_test = X[train], X[test]
#    y_train, y_test = y[train], y[test]

### SVR training and testing

In [14]:
from sklearn import svm
svr_rbf = svm.SVR(kernel='rbf', epsilon=0.5, C=99, gamma=1)

In [15]:
regressor=svr_rbf.fit(X_train,y_train)
y_pred=svr_rbf.predict(X_test)

### Evaluation Metrics

In [16]:
from sklearn.metrics import mean_squared_error
# Mean squared error function loss
# The mean_squared_error function computes mean square error, a risk metric corresponding to the expected value of the squared (quadratic) error or loss
error = mean_squared_error(y_test, y_pred)
print('RBF model Test MSE: %.3f' % error)

RBF model Test MSE: 237.563


## Parameter tuning

### GridSearchCV

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# SVR model
model=svm.SVR()
# dict of parameters
parameters={'kernel':['rbf'], 'gamma':['auto','scale'], 'epsilon':[0.01,0.05,0.5,1], 'C':[0.1, 1, 10, 100]}
# strategy to evaluate the performance of the cross-validated mode on the test set
scorer=make_scorer(mean_squared_error, greater_is_better=False)
# Build and fit the GridSearchCV
grid = GridSearchCV(estimator=model, param_grid=parameters, scoring=scorer, cv=KFold(n_splits=5))
grid_results = grid.fit(X, y)

best_param=grid_results.cv_results_['params'][grid_results.best_index_]
best_param

{'C': 100, 'epsilon': 1, 'gamma': 'scale', 'kernel': 'rbf'}

## SVR final parameters

In [17]:
svr=svm.SVR(kernel='rbf', epsilon=1, C=100, gamma='scale')

In [18]:
regressor=svr.fit(X_train,y_train)
y_pred=svr.predict(X_test)

In [19]:
from sklearn.metrics import mean_squared_error
# Mean squared error function loss
# The mean_squared_error function computes mean square error, a risk metric corresponding to the expected value of the squared (quadratic) error or loss
error = mean_squared_error(y_test, y_pred)
print('RBF model Test MSE: %.3f' % error)

RBF model Test MSE: 13.586


## Models for each type of day

### Weekdays

In [None]:
for i in range(len(X)):
    if X.index

### Saturdays

### Sundays and Holidays

In [None]:
def check_holiday(df,holi_df):
    
    index_to_drop = []
    count = 0
    for i in df['int_date'].values:
        if i in holi_df['int_date'].values:
            index_to_drop.append(count)
        count+=1
        
    dropped_df = df.drop(index_to_drop)
    
    return dropped_df