# Definitions

In [1]:
from pathlib import Path

from tqdm.notebook import tqdm

import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from prophet import Prophet

In [2]:
data_path = Path('..', 'data')

# Data

In [3]:
train = pd.read_csv(data_path.joinpath('train_data.csv'))

In [4]:
train.head()

Unnamed: 0,id,timestamp,rto_day,traffic,region_nm
0,1,2020-08-13,117135.29,638.0,Аваллонэ
1,1,2020-11-26,116102.2,619.0,Аваллонэ
2,2,2021-01-13,102750.37,461.0,Валимар
3,3,2019-07-03,100223.2,603.0,Ильмарин
4,3,2020-12-10,126296.59,624.0,Ильмарин


In [5]:
train.timestamp = pd.to_datetime(train.timestamp)

# Raw RTO Prophet. model per id

In [6]:
prophet_df = train[['timestamp', 'rto_day']].rename({'timestamp': 'DS', 'rto_day': 'y'}, axis=1)

In [7]:
prophet_df.head()

Unnamed: 0,DS,y
0,2020-08-13,117135.29
1,2020-11-26,116102.2
2,2021-01-13,102750.37
3,2019-07-03,100223.2
4,2020-12-10,126296.59


## Test Routine

In [8]:
tdf = train[train['id'] == 1].sort_values('timestamp')

In [9]:
tdf.shape

(818, 5)

In [10]:
train_tdf = tdf.iloc[:-28]
test_tdf = tdf.iloc[-28:]

In [11]:
model = Prophet()

In [12]:
prophet_tdf = train_tdf[['timestamp',
                        'rto_day']].rename({'timestamp': 'ds',
                                            'rto_day': 'y'}, axis=1)

In [13]:
prophet_tdf.head()

Unnamed: 0,ds,y
3263,2019-01-01,40373.99
217569,2019-01-02,76532.42
707249,2019-01-03,87443.71
574101,2019-01-04,72098.36
561905,2019-01-05,85673.11


In [14]:
model.fit(prophet_tdf)

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<prophet.forecaster.Prophet at 0x7f9aeb797f70>

In [15]:
prophet_test_tdf = test_tdf[['timestamp',
                        'rto_day']].rename({'timestamp': 'ds',
                                            'rto_day': 'y'}, axis=1)

In [16]:
predicts = model.predict(prophet_test_tdf)
predicts.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2021-03-01,135582.398939,137279.973733,173429.23139,135582.398939,135582.398939,19144.221454,19144.221454,19144.221454,9881.577879,9881.577879,9881.577879,9262.643575,9262.643575,9262.643575,0.0,0.0,0.0,154726.620393
1,2021-03-02,135645.794262,122416.014755,161329.842475,135645.794262,135645.794262,6674.975982,6674.975982,6674.975982,-2712.969774,-2712.969774,-2712.969774,9387.945756,9387.945756,9387.945756,0.0,0.0,0.0,142320.770245
2,2021-03-03,135709.189586,125866.543663,164445.035289,135709.189586,135709.189586,9632.449066,9632.449066,9632.449066,120.260635,120.260635,120.260635,9512.188431,9512.188431,9512.188431,0.0,0.0,0.0,145341.638652
3,2021-03-04,135772.58491,125852.032042,164664.877273,135772.58491,135772.58491,9359.153987,9359.153987,9359.153987,-273.505715,-273.505715,-273.505715,9632.659702,9632.659702,9632.659702,0.0,0.0,0.0,145131.738896
4,2021-03-05,135835.980233,134729.202979,172268.933362,135835.980233,135835.980233,17609.748337,17609.748337,17609.748337,7862.633012,7862.633012,7862.633012,9747.115325,9747.115325,9747.115325,0.0,0.0,0.0,153445.72857


In [17]:
validate_tdf = pd.merge(prophet_test_tdf, predicts[['ds', 'yhat']], on='ds')

In [18]:
validate_tdf.shape

(28, 3)

In [19]:
def trunc_to_week(dt_series):
    return pd.PeriodIndex(dt_series, freq='W-Sun')

In [20]:
week_val_tdf = validate_tdf.groupby(trunc_to_week(validate_tdf.ds)).sum()

In [21]:
validate_tdf.ds.groupby(trunc_to_week(validate_tdf.ds)).count() == 7

ds
2021-03-01/2021-03-07    True
2021-03-08/2021-03-14    True
2021-03-15/2021-03-21    True
2021-03-22/2021-03-28    True
Freq: W-SUN, Name: ds, dtype: bool

In [22]:
mean_absolute_error(week_val_tdf.y, week_val_tdf.yhat)

30607.586365015683

## Routine

In [30]:
splitter = TimeSeriesSplit(n_splits=5, test_size=28)

In [31]:
model_dict = {}
for id_, df in tqdm(train.sort_values('timestamp').groupby('id')):
    prophet_df = train[['timestamp',
                        'rto_day']].rename({'timestamp': 'ds',
                                            'rto_day': 'y'}, axis=1).reset_index(drop=True)
    model_mae = []
    for train_ix, test_ix in splitter.split(prophet_df):
        train_df = prophet_df.loc[train_ix]
        test_df = prophet_df.loc[test_ix]
        
        model = Prophet()
        model.fit(train_df)
        
        predicts = model.predict(test_df[['ds']])
        val_df = pd.merge(test_df, predicts[['ds', 'yhat']],
                          on='ds', how='inner')
        week_groups = val_df.groupby(trunc_to_week(val_df.ds))
        week_val_df = week_groups.sum()[week_groups.ds.count() == 7]
        
        model_mae.append(week_val_df['y'], week_val_df['yhat'])
    model_dict[id_] = model_mae

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





TypeError: append() takes exactly one argument (2 given)