In [1]:
import pandas as pd

In [2]:
features = ["date","hour","flow","anomaly"]
df = pd.read_csv('train_barreiro_1.csv', sep=';', names=features)

In [3]:
df.drop(['anomaly'], axis='columns', inplace=True)
df.head()

Unnamed: 0,date,hour,flow
0,03/05/2018,00:07:30,13.026578
1,03/05/2018,00:22:30,10.898906
2,03/05/2018,00:37:30,11.048772
3,03/05/2018,00:52:30,10.641706
4,03/05/2018,01:07:30,10.452578


In [4]:
df.head(50)

Unnamed: 0,date,hour,flow
0,03/05/2018,00:07:30,13.026578
1,03/05/2018,00:22:30,10.898906
2,03/05/2018,00:37:30,11.048772
3,03/05/2018,00:52:30,10.641706
4,03/05/2018,01:07:30,10.452578
5,03/05/2018,01:22:30,8.923089
6,03/05/2018,01:37:30,8.638289
7,03/05/2018,01:52:30,8.334467
8,03/05/2018,02:07:30,8.100783
9,03/05/2018,02:22:30,8.70625


## Lags of time series

Lag features are the classical way that time series forecasting problems are transformed into supervised learning problems.

The simplest approach is to predict the value at the next time (t+1) given the value at the previous time (t-1). The supervised learning problem with shifted values

### 10 previous readings

In [7]:
# in order to construct a new dataframe with D lagged values
df_lagged = df.copy()
number_lags = 10 # this is D

for i in range(1, number_lags+1):
    shifted = df['flow'].shift(i)
    df_lagged = pd.concat((df_lagged, shifted), axis=1)

In [8]:
# just to label the columns of the resulting dataframe
lagged_cols=["n-"+ str(x) for x in range(1,number_lags+1)]
colnames=["date","hour","n"]+lagged_cols
df_lagged.columns=colnames
df_lagged.head()

Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
0,03/05/2018,00:07:30,13.026578,,,,,,,,,,
1,03/05/2018,00:22:30,10.898906,13.026578,,,,,,,,,
2,03/05/2018,00:37:30,11.048772,10.898906,13.026578,,,,,,,,
3,03/05/2018,00:52:30,10.641706,11.048772,10.898906,13.026578,,,,,,,
4,03/05/2018,01:07:30,10.452578,10.641706,11.048772,10.898906,13.026578,,,,,,


In [9]:
# to extract only the lagged values for a specific time of the day
X=df_lagged[df_lagged['hour']=='09:07:30']
X

Unnamed: 0,date,hour,n,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
36,03/05/2018,09:07:30,33.45705,29.906861,27.215406,31.20935,29.761233,36.391,37.241356,37.920778,38.466733,25.987294,22.72635
132,04/05/2018,09:07:30,38.640146,34.401844,35.339883,39.506772,38.056217,38.643208,42.670367,44.017844,37.750839,30.385367,28.096978
228,07/05/2018,09:07:30,39.309483,40.080767,35.587478,39.772717,39.775383,43.426706,46.467833,48.494628,42.877585,31.664367,28.343117
324,08/05/2018,09:07:30,35.662872,33.419367,35.956989,34.801211,35.161256,43.833878,42.735139,48.156878,39.796244,34.504222,32.683117
420,09/05/2018,09:07:30,28.877554,29.660028,32.889611,44.975628,45.867239,55.893061,52.920394,49.542267,43.058928,38.191417,37.78115


In [None]:
hour=input(str("Do you want to predict the water flow rate at what time?"))
lag = df_lagged[df_lagged['hour'==hour]]
lag

In [None]:
# function that receives the dataset and the time used -> intented
def previous(data,h):
    lagged = data.copy()
    number_lags = 10 # this is D
    hour = h
    # in order to construct a new dataframe with D lagged values
    for i in range(1, number_lags+1):
        # shift the flow column
        shifted = lagged['flow'].shift(i)
        # resulting dataframe
        lagged = pd.concat((lagged, shifted), axis=1)
        # just to label the columns of the resulting dataframe
        lagged_cols=["n-"+ str(x) for x in range(1,number_lags+1)]
        col_names=["date","hour","n"]+lagged_cols
        lagged.columns=col_names
        
    lag = lagged[lagged['hour'==hour]]
    
    return lag

In [10]:
series = previous(df, '09:07:30')

ValueError: Length mismatch: Expected axis has 4 elements, new values have 13 elements

In [30]:
#in fact, column n will be the target variable; therefore
y=X['n'].copy()
y

36     33.457050
132    38.640146
228    39.309483
324    35.662872
420    28.877554
Name: n, dtype: float64

In [31]:
X.drop(['n'], axis = 1)

Unnamed: 0,date,hour,n-1,n-2,n-3,n-4,n-5,n-6,n-7,n-8,n-9,n-10
36,03/05/2018,09:07:30,29.906861,27.215406,31.20935,29.761233,36.391,37.241356,37.920778,38.466733,25.987294,22.72635
132,04/05/2018,09:07:30,34.401844,35.339883,39.506772,38.056217,38.643208,42.670367,44.017844,37.750839,30.385367,28.096978
228,07/05/2018,09:07:30,40.080767,35.587478,39.772717,39.775383,43.426706,46.467833,48.494628,42.877585,31.664367,28.343117
324,08/05/2018,09:07:30,33.419367,35.956989,34.801211,35.161256,43.833878,42.735139,48.156878,39.796244,34.504222,32.683117
420,09/05/2018,09:07:30,29.660028,32.889611,44.975628,45.867239,55.893061,52.920394,49.542267,43.058928,38.191417,37.78115
