In [2]:
import pandas as pd 

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('data/jobathon-april-2022/train_E1GspfA.csv')
# Print the (rows, columns) shape, then render the first rows as the cell output.
print(data.shape)
data.head()

(18247, 3)


Unnamed: 0,date,hour,demand
0,2018-08-18,9,91
1,2018-08-18,10,21
2,2018-08-18,13,23
3,2018-08-18,14,104
4,2018-08-18,15,81


In [4]:
# Summarise column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18247 entries, 0 to 18246
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    18247 non-null  object
 1   hour    18247 non-null  int64 
 2   demand  18247 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 427.8+ KB


In [5]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Merge the ``date`` and ``hour`` columns into one timestamp column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``date`` column parseable by ``pd.to_datetime`` and a
        numeric ``hour`` column giving the hour offset to add to the date.

    Returns
    -------
    pd.DataFrame
        A new frame in which ``date`` holds ``date + hour`` as a datetime and
        the ``hour`` column is removed. The input frame is not modified, so
        the function can be called again on the same raw frame.
    """
    # pd.to_timedelta with an explicit unit states the intent directly and
    # does not depend on casting integers to the 'timedelta64[h]' dtype,
    # whose hour resolution is not one pandas >= 2.0 stores natively.
    timestamps = pd.to_datetime(df['date']) + pd.to_timedelta(df['hour'], unit='h')
    # assign/drop build a new frame instead of mutating the caller's object.
    return df.assign(date=timestamps).drop(columns='hour')

In [6]:
# Fold the hour into the date column; `data` is rebound to the returned frame.
data = preprocess(data)
data.head()

Unnamed: 0,date,demand
0,2018-08-18 09:00:00,91
1,2018-08-18 10:00:00,21
2,2018-08-18 13:00:00,23
3,2018-08-18 14:00:00,104
4,2018-08-18 15:00:00,81


In [7]:
import plotly.express as px

# Line chart of demand against timestamp for the whole series.
fig = px.line(data, x='date', y='demand')
fig.show() 

In [8]:
# Re-display the first rows of the preprocessed frame.
data.head()

Unnamed: 0,date,demand
0,2018-08-18 09:00:00,91
1,2018-08-18 10:00:00,21
2,2018-08-18 13:00:00,23
3,2018-08-18 14:00:00,104
4,2018-08-18 15:00:00,81


In [9]:
# Inspect the rows inside a date window (both bounds inclusive).
# Alternative start bound at midnight, kept for reference:
# start_date = pd.to_datetime('2020-11-26')
start_date = pd.to_datetime('26/11/2020 22:00:00', format='%d/%m/%Y %H:%M:%S')
# No time component is given, so this bound is 2020-12-27 00:00:00; only the
# first hour of that day can satisfy `<= end_date`.
end_date = pd.to_datetime('2020-12-27')

# The displayed result has just three rows with consecutive index labels, i.e.
# the data has no records between 2020-11-26 23:00 and 2020-12-27 00:00.
data[(data['date'] >= start_date) & (data['date'] <= end_date)]

Unnamed: 0,date,demand
16826,2020-11-26 22:00:00,15
16827,2020-11-26 23:00:00,18
16828,2020-12-27 00:00:00,71


In [10]:
# Filter for rows whose demand is between 90 and 100 (inclusive)
# data[(data['demand'] >= 90) & (data['demand'] <= 100)]

In [11]:
from prophet import Prophet

In [12]:
# Rename to the `ds` (timestamp) / `y` (target) column names used by the
# Prophet model fitted below. The rename is applied to `data` in place.
data.rename(columns={'date': 'ds', 'demand': 'y'}, inplace=True)

data.head()

Unnamed: 0,ds,y
0,2018-08-18 09:00:00,91
1,2018-08-18 10:00:00,21
2,2018-08-18 13:00:00,23
3,2018-08-18 14:00:00,104
4,2018-08-18 15:00:00,81


In [15]:
# Which calendar years does the series cover?
data['ds'].dt.year.unique()

array([2018, 2019, 2020, 2021], dtype=int32)

In [17]:
# Show the rows from the first (2018) and last (2021) years in the data.
data[data['ds'].dt.year.isin([2018, 2021])]

Unnamed: 0,ds,y
0,2018-08-18 09:00:00,91
1,2018-08-18 10:00:00,21
2,2018-08-18 13:00:00,23
3,2018-08-18 14:00:00,104
4,2018-08-18 15:00:00,81
...,...,...
18242,2021-02-28 19:00:00,95
18243,2021-02-28 20:00:00,88
18244,2021-02-28 21:00:00,39
18245,2021-02-28 22:00:00,104


In [18]:
# Time-based hold-out: rows dated 2021 form the validation set, every other
# row (2018-2020 in this data) is used for training.
valid_data = data[data['ds'].dt.year == 2021]
train_data = data[data['ds'].dt.year != 2021]

print(f'Train data shape: {train_data.shape}')
print(f'Validation data shape: {valid_data.shape}')

Train data shape: (16942, 2)
Validation data shape: (1305, 2)


In [19]:
# Fit a Prophet model with its default settings on the training split.
model = Prophet()
model.fit(train_data)

21:56:25 - cmdstanpy - INFO - Chain [1] start processing
21:56:38 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x7f1101b52810>

In [20]:
# Confirm where the training split ends (last rows are 2020-12-31).
train_data.tail()

Unnamed: 0,ds,y
16937,2020-12-31 19:00:00,127
16938,2020-12-31 20:00:00,42
16939,2020-12-31 21:00:00,104
16940,2020-12-31 22:00:00,24
16941,2020-12-31 23:00:00,79


In [23]:
# Preview of a 365-period future frame; the result is only displayed, not kept.
# NOTE(review): no `freq` is passed and the displayed tail advances one day per
# row (always at 23:00) although the history is hourly — pass an hourly `freq`
# if an hourly horizon is intended.
model.make_future_dataframe(periods=365).tail()

Unnamed: 0,ds
17302,2021-12-27 23:00:00
17303,2021-12-28 23:00:00
17304,2021-12-29 23:00:00
17305,2021-12-30 23:00:00
17306,2021-12-31 23:00:00


In [26]:
# Predict at the validation timestamps (the `ds` column of valid_data).
forecast = model.predict(valid_data)
# Show the point forecast together with its lower/upper interval bounds.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
1300,2021-02-28 19:00:00,92.161494,43.414293,136.449631
1301,2021-02-28 20:00:00,83.959386,38.314903,133.748581
1302,2021-02-28 21:00:00,76.920902,36.627315,122.771418
1303,2021-02-28 22:00:00,72.121358,25.530703,117.849059
1304,2021-02-28 23:00:00,68.994494,24.857664,113.351456


In [27]:
# Sanity check: one forecast row per validation row is expected.
forecast.shape

(1305, 22)

In [28]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the point forecast on the 2021 hold-out.
# NOTE(review): the two Series carry different index labels (valid_data keeps
# its original row labels, forecast is numbered from 0); presumably the metric
# compares them by position — confirm.
mean_absolute_error(valid_data['y'], forecast['yhat'])

27.797844057295208

In [30]:
from prophet.plot import plot_plotly, plot_components_plotly


In [31]:
# Prophet's built-in interactive plot of the forecast.
plot_plotly(model, forecast)

In [37]:
# Work on copies so the frames used for fitting and prediction stay unchanged.
train_df = train_data.copy()
valid_df = valid_data.copy()


In [41]:
# Index both frames by timestamp so the plotting helper can use the index as
# the x-axis. Each indexed frame is derived from its source split rather than
# via set_index(..., inplace=True): the in-place form raises KeyError when the
# cell is run a second time, because 'ds' is then no longer a column.
valid_df = valid_data.set_index('ds')
train_df = train_data.set_index('ds')

In [42]:
# Check that `ds` is now the index and `y` the only column.
train_df.head()

Unnamed: 0_level_0,y
ds,Unnamed: 1_level_1
2018-08-18 09:00:00,91
2018-08-18 10:00:00,21
2018-08-18 13:00:00,23
2018-08-18 14:00:00,104
2018-08-18 15:00:00,81


In [74]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def forecast_plot(df1, df2, pred, names, model_name):
    """Display train, validation and predicted series on one interactive figure.

    Parameters
    ----------
    df1 : DataFrame
        Plotted as the first (blue) trace: its index on x, its ``y`` column on y.
    df2 : DataFrame
        Plotted as the second (red) trace in the same way; its index is also
        the x-axis of the prediction trace.
    pred : sequence of numbers
        Predicted values, drawn in green against ``df2.index``.
    names : sequence of str
        Legend labels; elements 0, 1 and 2 label the three traces in order.
    model_name : str
        Inserted into the figure title.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    fig = make_subplots(rows=1, cols=1, vertical_spacing=0.05)

    # Build the three series first, then attach them in drawing order.
    first_series = go.Scatter(
        name=names[0],
        x=df1.index,
        y=df1["y"],
        marker={"size": 10, "color": "blue"},
        textfont={"color": "black", "size": 18, "family": "Times New Roman"},
    )
    second_series = go.Scatter(
        name=names[1],
        x=df2.index,
        y=df2["y"],
        marker={"size": 10, "color": "red"},
    )
    predicted_series = go.Scatter(
        name=names[2],
        x=df2.index,
        y=pred,
        marker={"size": 10, "color": "green"},
    )
    for series in (first_series, second_series, predicted_series):
        fig.add_trace(series)

    # Quick-zoom buttons shown above the x-axis.
    zoom_buttons = [
        {"count": 1, "label": "1m", "step": "month", "stepmode": "backward"},
        {"count": 6, "label": "6m", "step": "month", "stepmode": "backward"},
        {"count": 1, "label": "YTD", "step": "year", "stepmode": "todate"},
        {"count": 1, "label": "1y", "step": "year", "stepmode": "backward"},
        {"step": "all"},
    ]
    fig.update_xaxes(rangeselector={"buttons": zoom_buttons})

    # Centered title plus a horizontal legend anchored at the top right.
    fig.update_layout(
        title={
            "text": f"{model_name} Model Plot",
            "y": 0.98,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": 15},
        },
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
    )

    # Keep one legend entry per label: hide the entry of any trace whose name
    # has already appeared on an earlier trace.
    seen_labels = set()
    for trace in fig.data:
        if trace.name in seen_labels:
            trace.update(showlegend=False)
        else:
            seen_labels.add(trace.name)

    fig.show()


In [75]:
# Overlay training data, validation data and the Prophet point forecast.
# forecast['yhat'] is drawn against valid_df's timestamp index inside the helper.
forecast_plot(train_df, valid_df, forecast['yhat'], ['Train', 'Valid', 'Predict'], 'Prophet') 