In [1]:
import pandas as pd

In [2]:
features = ["date","hour","flow","anomaly"]
df = pd.read_csv('train_barreiro.csv', sep=';', names=features)

In [3]:
df.drop(['anomaly'], axis='columns', inplace=True)
df.head()

Unnamed: 0,date,hour,flow
0,03/05/2018,00:07:30,13.026578
1,03/05/2018,00:22:30,10.898906
2,03/05/2018,00:37:30,11.048772
3,03/05/2018,00:52:30,10.641706
4,03/05/2018,01:07:30,10.452578


# Formatting into time-delayed vectors (sparse format)

## Lags of time series

Lag features are the classical way that time series forecasting problems are transformed into supervised learning problems.

The simplest approach is to predict the value at the next time (t+1) given the value at the previous time (t-1). The supervised learning problem with shifted values

### 10 previous readings

In [4]:
# in order to construct a new dataframe with D lagged values
df_lagged = df.copy()
number_lags = 10 # this is D

for i in range(1, number_lags+1):
    shifted = df['flow'].shift(i)
    df_lagged = pd.concat((df_lagged, shifted), axis=1)

In [5]:
# just to label the columns of the resulting dataframe
lagged_cols=["n-"+ str(x) for x in range(1,number_lags+1)]
colnames=["hour","n"]+lagged_cols
df_lagged.columns=colnames
df_lagged.head()

Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
0,03/05/2018,00:07:30,13.026578,,,,,,,,,,
1,03/05/2018,00:22:30,10.898906,13.026578,,,,,,,,,
2,03/05/2018,00:37:30,11.048772,10.898906,13.026578,,,,,,,,
3,03/05/2018,00:52:30,10.641706,11.048772,10.898906,13.026578,,,,,,,
4,03/05/2018,01:07:30,10.452578,10.641706,11.048772,10.898906,13.026578,,,,,,


In [6]:
pd.set_option("display.max_rows", 200)
df_lagged.head(200)

Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
0,03/05/2018,00:07:30,13.026578,,,,,,,,,,
1,03/05/2018,00:22:30,10.898906,13.026578,,,,,,,,,
2,03/05/2018,00:37:30,11.048772,10.898906,13.026578,,,,,,,,
3,03/05/2018,00:52:30,10.641706,11.048772,10.898906,13.026578,,,,,,,
4,03/05/2018,01:07:30,10.452578,10.641706,11.048772,10.898906,13.026578,,,,,,
5,03/05/2018,01:22:30,8.923089,10.452578,10.641706,11.048772,10.898906,13.026578,,,,,
6,03/05/2018,01:37:30,8.638289,8.923089,10.452578,10.641706,11.048772,10.898906,13.026578,,,,
7,03/05/2018,01:52:30,8.334467,8.638289,8.923089,10.452578,10.641706,11.048772,10.898906,13.026578,,,
8,03/05/2018,02:07:30,8.100783,8.334467,8.638289,8.923089,10.452578,10.641706,11.048772,10.898906,13.026578,,
9,03/05/2018,02:22:30,8.70625,8.100783,8.334467,8.638289,8.923089,10.452578,10.641706,11.048772,10.898906,13.026578,


In [7]:
h=input(str("When do you want to predict the water flow rate? "))
lag = df_lagged[df_lagged['hour']==h]
lag

When do you want to predict the water flow rate? 12:07:30


Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
48,03/05/2018,12:07:30,38.419906,30.266017,29.420528,30.820122,38.77525,31.581917,26.850378,28.529311,32.733603,33.166111,33.733603
144,04/05/2018,12:07:30,60.155451,47.535539,44.081078,44.436072,45.897733,38.435728,35.549272,41.243911,40.015867,38.657361,39.014028
240,07/05/2018,12:07:30,49.953589,49.169039,46.32685,58.8461,58.607078,46.50185,41.541106,43.672839,42.071861,42.651317,35.204901
336,08/05/2018,12:07:30,48.079589,43.858278,49.31595,52.917706,51.839314,45.278706,40.6984,34.556633,29.286794,32.427022,29.559583
432,09/05/2018,12:07:30,31.548572,31.162772,29.956494,29.507256,31.474044,32.101822,37.624967,31.390272,34.497472,29.638656,30.654917


### 10 previous for specific hour in a function

In [8]:
# function that receives the dataset and the time used -> intented
def previous(data,h):
    df_lagged = data.copy()
    number_lags = 10 # this is D
    hour = str(h)
    # in order to construct a new dataframe with D lagged values
    for i in range(1, number_lags+1):
        # shift the flow column
        shifted = data['flow'].shift(i)
        # resulting dataframe
        df_lagged = pd.concat((df_lagged, shifted), axis=1)
        # just to label the columns of the resulting dataframe
    lagged_cols=["n-"+ str(x) for x in range(1,number_lags+1)]
    colnames=["date","hour","n"]+lagged_cols
    df_lagged.columns=colnames
    # filtering by an hour
    lag = df_lagged[df_lagged['hour']==hour]
    
    return lag

In [9]:
series = previous(df,'12:07:30')

In [10]:
series

Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
48,03/05/2018,12:07:30,38.419906,30.266017,29.420528,30.820122,38.77525,31.581917,26.850378,28.529311,32.733603,33.166111,33.733603
144,04/05/2018,12:07:30,60.155451,47.535539,44.081078,44.436072,45.897733,38.435728,35.549272,41.243911,40.015867,38.657361,39.014028
240,07/05/2018,12:07:30,49.953589,49.169039,46.32685,58.8461,58.607078,46.50185,41.541106,43.672839,42.071861,42.651317,35.204901
336,08/05/2018,12:07:30,48.079589,43.858278,49.31595,52.917706,51.839314,45.278706,40.6984,34.556633,29.286794,32.427022,29.559583
432,09/05/2018,12:07:30,31.548572,31.162772,29.956494,29.507256,31.474044,32.101822,37.624967,31.390272,34.497472,29.638656,30.654917


### Converts hour in indexes

In [7]:
# to add a new column called time
# df_lagged['time']=df_lagged['hour'].copy()

In [31]:
# in order to identify the time as indexes from 0 to 95
indexed=0
number_rows=96 # this is the number of rows per day

for i in range(1, number_rows):
    df_lagged['time']=df_lagged['time'].replace({i:indexed+1})
#    reindexed=df_lagged['time']
#    df_lagged=df_lagged.append(reindexed)

In [32]:
df_lagged

Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,...,hour.1,hour.2,hour.3,hour.4,hour.5,hour.6,hour.7,hour.8,hour.9,hour.10
0,03/05/2018,00:07:30,13.026578,,,,,,,,...,00:07:30,00:07:30,00:07:30,00:07:30,00:07:30,00:07:30,00:07:30,00:07:30,00:07:30,00:07:30
1,03/05/2018,00:22:30,10.898906,13.026578,,,,,,,...,00:22:30,00:22:30,00:22:30,00:22:30,00:22:30,00:22:30,00:22:30,00:22:30,00:22:30,00:22:30
2,03/05/2018,00:37:30,11.048772,10.898906,13.026578,,,,,,...,00:37:30,00:37:30,00:37:30,00:37:30,00:37:30,00:37:30,00:37:30,00:37:30,00:37:30,00:37:30
3,03/05/2018,00:52:30,10.641706,11.048772,10.898906,13.026578,,,,,...,00:52:30,00:52:30,00:52:30,00:52:30,00:52:30,00:52:30,00:52:30,00:52:30,00:52:30,00:52:30
4,03/05/2018,01:07:30,10.452578,10.641706,11.048772,10.898906,13.026578,,,,...,01:07:30,01:07:30,01:07:30,01:07:30,01:07:30,01:07:30,01:07:30,01:07:30,01:07:30,01:07:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,09/05/2018,22:52:30,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,36.618228,37.464811,...,22:52:30,22:52:30,22:52:30,22:52:30,22:52:30,22:52:30,22:52:30,22:52:30,22:52:30,22:52:30
476,09/05/2018,23:07:30,19.707922,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,36.618228,...,23:07:30,23:07:30,23:07:30,23:07:30,23:07:30,23:07:30,23:07:30,23:07:30,23:07:30,23:07:30
477,09/05/2018,23:22:30,19.546844,19.707922,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,...,23:22:30,23:22:30,23:22:30,23:22:30,23:22:30,23:22:30,23:22:30,23:22:30,23:22:30,23:22:30
478,09/05/2018,23:37:30,21.068406,19.546844,19.707922,22.357783,23.667603,25.941972,28.309133,30.982689,...,23:37:30,23:37:30,23:37:30,23:37:30,23:37:30,23:37:30,23:37:30,23:37:30,23:37:30,23:37:30


In [7]:
def reindex(df):
    """Name the time as indexes from 0 to 95"""
    column=df['time']
    for i in range (len(column)):
        #

df_lagged['time'] = df_lagged.['hour'].apply(reindex)
df.set_index('hour', drop=True, inplace=True)

AttributeError: 'RangeIndex' object has no attribute 'apply'

In [None]:
indexes={'00:07:30':0, '00:22:30':1, '00:37:30':2, '00:52:30':3,
         '01:07:30':4, '01:22:30':5, '01:37:30':6, '01:52:30':7,
         '02:07:30':8, '02:22:30':9, '02:37:30':10, '02:52:30':11,
         '03:07:30':12, '03:22:30':13, '03:37:30':14, '03:52:30':15,
         '04:07:30':16, '04:22:30':17, '04:37:30':18, '04:52:30':19,
         '05:07:30':20, '05:22:30':21, '05:37:30':22, '05:52:30':23,
         '06:07:30':24, '06:22:30':25, '06:37:30':26, '06:52:30':27,
         '07:07:30':28, '07:22:30':29, '07:37:30':30, '07:52:30':32,
         '08:07:30':32, '08:22:30':33, '08:37:30':34, '08:52:30':35,
         '09:07:30':36, '09:22:30':37, '09:37:30':38, '09:52:30':39,
         '10:07:30':40, '10:22:30':41, '10:37:30':42, '10:52:30':43,
         '11:07:30':44, '11:22:30':45, '11:37:30':46, '11:52:30':47,
         '12:07:30':48, '12:22:30':49, '12:37:30':50, '12:52:30':51,
         '13:07:30':52, '13:22:30':53, '13:37:30':54, '13:52:30':55,
         '14:07:30':56, '14:22:30':57, '14:37:30':58, '14:52:30':59,
         '15:07:30':60, '15:22:30':61, '15:37:30':62, '15:52:30':63,
         '16:07:30':64, '16:22:30':65, '16:37:30':66, '16:52:30':67,
         '17:07:30':68, '17:22:30':69, '17:37:30':70, '17:52:30':71,
         '18:07:30':72, '18:22:30':73, '18:37:30':74, '18:52:30':75,
         '19:07:30':76, '19:22:30':77, '19:37:30':78, '19:52:30':79,
         '20:07:30':80, '20:22:30':81, '20:37:30':82, '20:52:30':83,
         '21:07:30':84, '21:22:30':85, '21:37:30':86, '21:52:30':87,
         '22:07:30':88, '22:22:30':89, '22:37:30':90, '22:52:30':91,
         '23:07:30':92, '23:22:30':93, '23:37:30':94, '23:52:30':95}

In [None]:
df_lagged.Gender = [indexes[item] for item in df_lagged['hour']]

### Pass date to index

In [11]:
# due to errors, we passed the date as an index of the dataset
df_lagged['date'] = pd.to_datetime(df_lagged['date'], dayfirst=True).dt.strftime('%Y-%m-%d').astype(str)
df_lagged['date'] = pd.DatetimeIndex(data=df_lagged['date'], dtype='datetime64[ns]', name='date', freq=None)
df_lagged = df_lagged.set_index('date')
df_lagged.head()

Unnamed: 0_level_0,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-05-03,00:07:30,13.026578,,,,,,,,,,
2018-05-03,00:22:30,10.898906,13.026578,,,,,,,,,
2018-05-03,00:37:30,11.048772,10.898906,13.026578,,,,,,,,
2018-05-03,00:52:30,10.641706,11.048772,10.898906,13.026578,,,,,,,
2018-05-03,01:07:30,10.452578,10.641706,11.048772,10.898906,13.026578,,,,,,


### Drop the first day of the dataset

In [12]:
# in order to drop the first 96 rows
df_final = df_lagged.iloc[96:, :]

In [13]:
df_final

Unnamed: 0_level_0,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-05-04,00:07:30,18.588483,20.624694,24.455350,25.895344,27.544022,27.512039,26.801667,29.252894,32.012006,32.764283,33.272778
2018-05-04,00:22:30,16.416578,18.588483,20.624694,24.455350,25.895344,27.544022,27.512039,26.801667,29.252894,32.012006,32.764283
2018-05-04,00:37:30,16.795065,16.416578,18.588483,20.624694,24.455350,25.895344,27.544022,27.512039,26.801667,29.252894,32.012006
2018-05-04,00:52:30,16.106378,16.795065,16.416578,18.588483,20.624694,24.455350,25.895344,27.544022,27.512039,26.801667,29.252894
2018-05-04,01:07:30,15.254000,16.106378,16.795065,16.416578,18.588483,20.624694,24.455350,25.895344,27.544022,27.512039,26.801667
...,...,...,...,...,...,...,...,...,...,...,...,...
2018-05-09,22:52:30,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,36.618228,37.464811,38.432406,44.922167,49.984489
2018-05-09,23:07:30,19.707922,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,36.618228,37.464811,38.432406,44.922167
2018-05-09,23:22:30,19.546844,19.707922,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,36.618228,37.464811,38.432406
2018-05-09,23:37:30,21.068406,19.546844,19.707922,22.357783,23.667603,25.941972,28.309133,30.982689,32.801550,36.618228,37.464811


### Selecting data

In [14]:
# to extract only the lagged values for a specific time of the day
X=df_final[df_final['hour']=='09:07:30']
X

Unnamed: 0_level_0,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-05-04,09:07:30,38.640146,34.401844,35.339883,39.506772,38.056217,38.643208,42.670367,44.017844,37.750839,30.385367,28.096978
2018-05-07,09:07:30,39.309483,40.080767,35.587478,39.772717,39.775383,43.426706,46.467833,48.494628,42.877585,31.664367,28.343117
2018-05-08,09:07:30,35.662872,33.419367,35.956989,34.801211,35.161256,43.833878,42.735139,48.156878,39.796244,34.504222,32.683117
2018-05-09,09:07:30,28.877554,29.660028,32.889611,44.975628,45.867239,55.893061,52.920394,49.542267,43.058928,38.191417,37.78115


## Train test Split

In [15]:
# X is the data to predict 9h
y = X['n'].copy()
X = X.drop(['n'], axis = 1)

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4)

In [None]:
# TIME SERIES SPLIT
# Provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets.
# In each split, test indices must be higher than before, and thus shuffling in cross validator is inappropriate.
# This cross-validation object is a variation of KFold. In the kth split, it returns first k folds as train set and the (k+1)th fold as test set.
#import numpy as np
#from sklearn.model_selection import TimeSeriesSplit
#tscv = TimeSeriesSplit(n_splits=1, test_size=2)
#tscv
#for train, test in tscv.split(X):
#    print("TRAIN:", train, "TEST:", test)
#    X_train, X_test = X[train], X[test]
#    y_train, y_test = y[train], y[test]

# SVM training and testing

In [17]:
import matplotlib.pyplot as plt
from sklearn import svm
svr_rbf = svm.SVR(kernel='rbf', epsilon=0.5, C=99, gamma=1)
# gamma='scale'
# tol=0.5
# C=99

# epsilon_val=10

In [18]:
svr_rbf.fit(X_train,y_train)
y_pred=svr_rbf.predict(X_test)

ValueError: could not convert string to float: '09:07:30'