In [1]:
import pandas as pd
from pandas import datetime
from pandas import DataFrame

from pandas import DataFrame
from pandas import concat

from sklearn.preprocessing import MinMaxScaler

  


In [2]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Arguments:
        data: Sequence of observations as a list, a 1-D or 2-D NumPy array,
            or any other object accepted by the DataFrame constructor.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning. Columns
        are named 'var<j>(t-<i>)' for lagged inputs, 'var<j>(t)' for the
        current step and 'var<j>(t+<i>)' for later steps.
    """
    df = DataFrame(data)
    # Count the variables on the constructed frame rather than on the raw
    # input: a 1-D array has no shape[1], and a list of rows holds more than
    # one variable, so inspecting `data` directly gives the wrong answer.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names.extend('var%d(t-%d)' % (j + 1, i) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, i) for j in range(n_vars))
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # shifting leaves NaN in the first/last rows; drop them on request
    if dropnan:
        agg = agg.dropna()
    return agg

In [3]:
# Jakarta (Indonesia) dataset: case, mobility and weather tables.
# `case` and `mobility` are indexed by their 'date' column straight away;
# `weather` keeps its default index at this point.
case = pd.read_csv('data/jakarta/case_maret_july.csv').set_index('date')
mobility = pd.read_csv('data/jakarta/mobility_maret_july.csv').set_index('date')
weather = pd.read_csv('data/jakarta/weather_maret_july.csv')

In [4]:
# Fill the gaps in the weather table with a second-order spline, then index
# the result by its 'date' column.
new_weather = (
    weather
    .interpolate(method='spline', order=2)
    .set_index('date')
)

In [5]:
# Place the case, weather and mobility tables side by side, aligned on their index.
frames = [case, new_weather, mobility]
data = pd.concat(frames, axis=1)
data.head()

Unnamed: 0_level_0,case,rh,avt,mxt,mnt,retail_recreation,grocery_pahrmacy,parks,transit_station,workplace,residential
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-03-18,38,86.0,27.6,29.0,25.8,-22,-3,-28,-32,-19,11
2020-03-19,50,83.0,28.1,31.4,25.8,-28,-12,-33,-38,-21,12
2020-03-20,14,82.0,27.4,31.2,25.2,-30,-11,-32,-42,-23,15
2020-03-21,44,81.0,28.1,32.4,26.602196,-35,-5,-38,-40,-10,11
2020-03-22,36,84.0,27.2,31.2,25.0,-49,-22,-59,-49,-12,14


In [6]:
# Pull the underlying array out of the frame and make sure every entry is float32.
values = data.values.astype('float32')

In [7]:
# normalize features: rescale every column to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1)).fit(values)
scaled = scaler.transform(values)
print(scaled)

[[0.00255864 0.7428571  0.40983582 ... 0.7704917  0.8428571  0.28125   ]
 [0.00383795 0.65714276 0.49180317 ... 0.67213106 0.8142857  0.3125    ]
 [0.         0.6285714  0.37704897 ... 0.6065573  0.78571427 0.40625   ]
 ...
 [0.9868869  0.5142857  0.6393442  ... 0.65573764 0.78571427 0.28125   ]
 [0.99999994 0.5142857  0.62295055 ... 0.6065573  0.8428571  0.3125    ]
 [0.8884861  0.5142857  0.5409837  ... 0.5737704  0.5428571  0.4375    ]]


In [8]:
# frame as supervised learning: one lagged step as input, one step as output
reframed = series_to_supervised(scaled, n_in=1, n_out=1)
print(reframed)

     var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  var6(t-1)  \
1     0.002559   0.742857   0.409836      0.175   0.560000   0.821429   
2     0.003838   0.657143   0.491803      0.475   0.560000   0.714286   
3     0.000000   0.628571   0.377049      0.450   0.440001   0.678571   
4     0.003198   0.600000   0.491803      0.600   0.720439   0.589286   
5     0.002345   0.685714   0.344262      0.450   0.400000   0.339286   
..         ...        ...        ...        ...        ...        ...   
463   0.498827   0.600000   0.442623      0.400   0.440001   0.714286   
464   0.798614   0.600000   0.360655      0.325   0.400000   0.714286   
465   0.737740   0.514286   0.557377      0.675   0.480000   0.767857   
466   0.986887   0.514286   0.639344      0.625   0.560000   0.642857   
467   1.000000   0.514286   0.622951      0.675   0.680000   0.535714   

     var7(t-1)  var8(t-1)  var9(t-1)  var10(t-1)  ...   var2(t)   var3(t)  \
1     0.651515   0.944444   0.770492    0.8428

In [9]:
# drop columns we don't want to predict: keep every lagged input column and
# only the first variable at time t as the target.
# The columns are picked by name and the result is rebound instead of mutated
# in place, so the cell can be re-run safely (a positional, in-place drop of
# columns 12-21 raises IndexError the second time, once they are gone) and it
# no longer hard-codes the number of variables.
unwanted = [
    col for col in reframed.columns
    if col.endswith('(t)') and col != 'var1(t)'
]
reframed = reframed.drop(columns=unwanted)
print(reframed.head())

   var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  var6(t-1)  \
1   0.002559   0.742857   0.409836      0.175   0.560000   0.821429   
2   0.003838   0.657143   0.491803      0.475   0.560000   0.714286   
3   0.000000   0.628571   0.377049      0.450   0.440001   0.678571   
4   0.003198   0.600000   0.491803      0.600   0.720439   0.589286   
5   0.002345   0.685714   0.344262      0.450   0.400000   0.339286   

   var7(t-1)  var8(t-1)  var9(t-1)  var10(t-1)  var11(t-1)   var1(t)  
1   0.651515   0.944444   0.770492    0.842857     0.28125  0.003838  
2   0.515152   0.875000   0.672131    0.814286     0.31250  0.000000  
3   0.530303   0.888889   0.606557    0.785714     0.40625  0.003198  
4   0.621212   0.805556   0.639344    0.971429     0.28125  0.002345  
5   0.363636   0.513889   0.491803    0.942857     0.37500  0.003945  


In [16]:
# Split the supervised frame into train and test partitions.
values = reframed.values

# NOTE(review): the cut-off equals the number of rows, so `train` holds every
# row and `test` is empty -- confirm this is intended.
n_total = len(values)
train, test = values[:n_total], values[n_total:]

# The last column is the output (y); all preceding columns are the inputs (X).
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

In [15]:
#values.shape[0]

#n_train_hours = 365 * 24
#train = values[:n_train_hours, :]
#test = values[n_train_hours:, :]

467