In [3]:
import pandas as pd
import numpy as np

from prophet import Prophet

from matplotlib import pyplot

In [13]:
training_data = pandas.read_csv('./submission.csv')
training_data.shape
training_data

Unnamed: 0,date,hour,demand
0,2021-03-01,0,65
1,2021-03-01,1,61
2,2021-03-01,2,58
3,2021-03-01,3,55
4,2021-03-01,5,55
...,...,...,...
7645,2022-03-28,19,75
7646,2022-03-28,20,67
7647,2022-03-28,21,59
7648,2022-03-28,22,55


In [14]:
training_data['date']= pd.to_datetime(training_data['date'])
training_data

Unnamed: 0,date,hour,demand
0,2021-03-01,0,65
1,2021-03-01,1,61
2,2021-03-01,2,58
3,2021-03-01,3,55
4,2021-03-01,5,55
...,...,...,...
7645,2022-03-28,19,75
7646,2022-03-28,20,67
7647,2022-03-28,21,59
7648,2022-03-28,22,55


In [15]:
print(f"starting date : {str(training_data['date'].dt.date.min())}")
print(f"end date : {str(training_data['date'].dt.date.max())}")

starting date : 2021-03-01
end date : 2022-03-28


In [16]:
def dataPreprocessing(dataFrame):
    dataFrame['date'] = pd.to_datetime(dataFrame['date']) + dataFrame['hour'].astype('timedelta64[h]')
    dataFrame.drop(columns=['hour'], axis=1, inplace=True)
    return dataFrame

In [17]:
training_data = dataPreprocessing(training_data)
training_data.head()

Unnamed: 0,date,demand
0,2021-03-01 00:00:00,65
1,2021-03-01 01:00:00,61
2,2021-03-01 02:00:00,58
3,2021-03-01 03:00:00,55
4,2021-03-01 05:00:00,55


In [18]:
import plotly.express as px

In [19]:
fig = px.line(training_data, x='date', y='demand')
fig.update_xaxes(rangeslider_visible=True)
fig.show()

In [42]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot

training_data.rename(columns={'date': 'ds', 'demand': 'y'}, inplace=True)
train_data = training_data.sample(frac=0.8, random_state=10)
validation_data = training_data.drop(train_data.index)

print(f'training data size : {train_data.shape}')
print(f'validation data size : {validation_data.shape}')

train_data = train_data.reset_index()
validation_data = validation_data.reset_index()

training data size : (6120, 2)
validation data size : (1530, 2)


In [53]:
# training_data
# train_data
# validation_data

Unnamed: 0,ds,y
0,2021-03-01 00:00:00,65
1,2021-03-01 01:00:00,61
2,2021-03-01 02:00:00,58
3,2021-03-01 03:00:00,55
4,2021-03-01 05:00:00,55
...,...,...
7645,2022-03-28 19:00:00,75
7646,2022-03-28 20:00:00,67
7647,2022-03-28 21:00:00,59
7648,2022-03-28 22:00:00,55


In [21]:
from sklearn.metrics import mean_absolute_error
from prophet import Prophet

In [22]:
model = Prophet()
model.fit(train_data)

13:17:14 - cmdstanpy - INFO - Chain [1] start processing
13:17:18 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x1d23c2cd610>

In [33]:
pd.DataFrame({'ds':validation_data['ds']})

Unnamed: 0,ds
0,2021-03-01 00:00:00
1,2021-03-01 15:00:00
2,2021-03-02 04:00:00
3,2021-03-02 09:00:00
4,2021-03-02 23:00:00
...,...
1525,2022-03-28 11:00:00
1526,2022-03-28 12:00:00
1527,2022-03-28 16:00:00
1528,2022-03-28 17:00:00


In [23]:
prediction = model.predict(pd.DataFrame({'ds':validation_data['ds']}))
y_actual = validation_data['y']
y_predicted = prediction['yhat']
y_predicted = y_predicted.astype(int)
mean_absolute_error(y_actual, y_predicted)

1.430718954248366

In [64]:
tuple(zip(y_actual.to_numpy(), y_predicted.to_numpy()))

((65, 66),
 (98, 99),
 (49, 49),
 (71, 72),
 (61, 61),
 (51, 51),
 (66, 66),
 (85, 84),
 (96, 96),
 (98, 97),
 (91, 91),
 (84, 83),
 (53, 53),
 (100, 99),
 (78, 77),
 (64, 64),
 (54, 54),
 (72, 71),
 (81, 80),
 (84, 83),
 (104, 103),
 (116, 115),
 (112, 110),
 (79, 78),
 (76, 75),
 (94, 93),
 (85, 83),
 (72, 70),
 (68, 67),
 (58, 57),
 (74, 73),
 (80, 79),
 (98, 96),
 (95, 93),
 (55, 54),
 (51, 50),
 (48, 47),
 (54, 53),
 (57, 56),
 (54, 54),
 (93, 93),
 (96, 95),
 (76, 75),
 (57, 56),
 (54, 53),
 (69, 69),
 (103, 102),
 (85, 84),
 (72, 71),
 (65, 64),
 (107, 106),
 (115, 114),
 (108, 107),
 (86, 85),
 (81, 80),
 (78, 77),
 (68, 68),
 (106, 106),
 (106, 105),
 (81, 80),
 (68, 67),
 (56, 55),
 (90, 90),
 (70, 69),
 (90, 90),
 (47, 46),
 (46, 45),
 (79, 79),
 (89, 88),
 (91, 91),
 (93, 92),
 (93, 92),
 (57, 57),
 (50, 49),
 (47, 47),
 (65, 64),
 (100, 100),
 (69, 69),
 (65, 65),
 (76, 76),
 (83, 82),
 (106, 106),
 (81, 81),
 (73, 73),
 (62, 62),
 (75, 74),
 (101, 101),
 (102, 102),
 (99,

In [25]:
from plotly.subplots import make_subplots

import plotly.graph_objects as go

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x=validation_data['ds'], y=y_actual, name="actual targets"),
    secondary_y=False,)
fig.add_trace(
    go.Scatter(x=validation_data['ds'], y=y_predicted, name="predicted targets"),
    secondary_y=True,)
fig.update_layout(
    title_text="Actual vs Predicted Targets")
fig.update_xaxes(title_text="Timeline")
fig.update_yaxes(title_text="actual targets", secondary_y=False)
fig.update_yaxes(title_text="predicted targets", secondary_y=True)
fig.show()

In [26]:
test_data = pandas.read_csv('./sample_4E0BhPN.csv')
print(f'test dataset size : {test_data.shape}')
testing_data = dataPreprocessing(test_data.copy())
testing_data.head()

test dataset size : (7650, 3)


Unnamed: 0,date,demand
0,2021-03-01 00:00:00,0
1,2021-03-01 01:00:00,0
2,2021-03-01 02:00:00,0
3,2021-03-01 03:00:00,0
4,2021-03-01 05:00:00,0


In [66]:
test_prediction = model.predict(pd.DataFrame({'ds':testing_data['date']}))

In [31]:
test_prediction = test_prediction['yhat']
test_prediction = test_prediction.astype(int)
test_data['demand'] = test_prediction
test_data.head()

Unnamed: 0,date,hour,demand
0,2021-03-01,0,66
1,2021-03-01,1,63
2,2021-03-01,2,59
3,2021-03-01,3,56
4,2021-03-01,5,56


In [32]:

test_data.to_csv('submission.csv', index=False)