# Todo
- [ ] Evaluate the forecast models
   - [x] Build model for ~2018, and test for the first week of 2019
   - [x] Measure the time of fitting
   - [ ] Repeat this procedure for all 2018
      - [x] Extract a week for input
      - [x] Add the input into the training data
      - [x] Forecast next week
      - [ ] Forecast next one day

# Dependency

In [61]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.getLogger('fbprophet').setLevel(logging.WARNING)

import pandas as pd
from fbprophet import Prophet
import time
from tqdm.notebook import tqdm

# Load Training Data

In [4]:
all_df = pd.read_csv('data/train.csv')

USED_CROPS = ['鳳梨-金鑽鳳梨', '香蕉', '青蔥-日蔥', '蓮霧-紅蓮霧', '芽菜類-黃豆牙', '南瓜-木瓜形', '大蒜-蒜仁', '小白菜-土白菜', '木瓜-網室紅肉', '洋香瓜-網狀紅肉']

all_df = all_df.loc[all_df['市場名稱'] == '台北一']
all_df = all_df[all_df['作物名稱'].isin(USED_CROPS)]

grouped_df = all_df.groupby(all_df.作物名稱)

# Build Models

In [6]:
def build_model(df, y_name):
  df = df.rename(columns={y_name: 'y', 'DateTime': 'ds'})
  df['ds'] = pd.to_datetime(df['ds'])
  
  # fit a model
  m = Prophet()
  m.fit(df)
  return m  

In [7]:
df = grouped_df.get_group('香蕉')
df_price = df[['平均價', 'DateTime']]
model = build_model(df_price, '平均價')
future = model.make_future_dataframe(periods=365)
forecast = model.predict(future)

In [14]:
# testing = forecast.loc[forecast['ds'] >= 2019]

testing = forecast[(forecast['ds'] >= '2019-01-01') & (forecast['ds'] < '2019-12-31')]

# Define the date separate points

In [58]:
DATE_POINTS = []
start_date = pd.to_datetime('2019-01-01')
end_date = start_date

while end_date < pd.to_datetime('2020-01-01'):
  start_date = end_date
  end_date = pd.to_datetime(start_date) + pd.DateOffset(days=7)
  DATE_POINTS.append((start_date, end_date))

In [72]:
print(DATE_POINTS[1], DATE_POINTS[2])
test_df = pd.read_csv('data/test.csv')

(Timestamp('2019-01-08 00:00:00'), Timestamp('2019-01-15 00:00:00')) (Timestamp('2019-01-15 00:00:00'), Timestamp('2019-01-22 00:00:00'))


In [73]:
test_df = test_df.loc[test_df['市場名稱'] == '台北一']
test_df = test_df[test_df['作物名稱'].isin(USED_CROPS)]

test_grouped_df = test_df.groupby(test_df.作物名稱)

In [28]:
test_data = test_grouped_df.get_group('香蕉')

## Extract 7 days from the testing data

In [42]:
test_1 = test_data[(pd.to_datetime(test_data['DateTime']) >= DATE_POINTS[1]) & (pd.to_datetime(test_data['DateTime']) < DATE_POINTS[2])]

test_2 = test_data[(pd.to_datetime(test_data['DateTime']) >= DATE_POINTS[2]) & (pd.to_datetime(test_data['DateTime']) < DATE_POINTS[3])]
test_2
# test_2

Unnamed: 0,Year,Month,Day,作物代號,作物名稱,市場代號,市場名稱,平均價,1_day_return,3_day_return,...,C0Z061_StnPres,C0Z061_Temperature,C0Z061_WS,C0Z061_WSGust,C0Z061_Precp,467660_StnPres,467660_Temperature,467660_WS,467660_WSGust,467660_Precp
1780,2019.0,1.0,15.0,A1,香蕉,109,台北一,24.8,0.404858,-6.766917,...,1002.3,19.6,0.3,2.4,0.0,1017.2,21.6,0.9,5.2,12.0
1781,2019.0,1.0,16.0,A1,香蕉,109,台北一,25.9,4.435484,-3.358209,...,1003.4,17.7,1.0,5.7,3.0,1017.3,20.8,1.3,9.3,0.0
1782,2019.0,1.0,17.0,A1,香蕉,109,台北一,26.8,3.474903,8.502024,...,1005.2,17.2,1.0,5.5,0.0,1019.4,20.0,1.6,9.8,0.0
1783,2019.0,1.0,18.0,A1,香蕉,109,台北一,27.8,3.731343,12.096774,...,1007.4,17.1,1.0,6.7,0.0,1022.0,19.6,1.9,10.8,0.0
1784,2019.0,1.0,19.0,A1,香蕉,109,台北一,29.3,5.395683,13.127413,...,1004.5,17.4,0.3,2.1,0.0,1019.5,20.2,0.9,5.7,1.2
1785,2019.0,1.0,20.0,A1,香蕉,109,台北一,33.1,12.969283,23.507463,...,1003.4,17.9,0.8,7.1,4.5,1017.9,20.3,1.5,9.8,3.5


In [43]:
test2_price = test_2[['平均價', 'DateTime']]
new_train = pd.concat([df_price, test2_price])
new_train

Unnamed: 0,平均價,DateTime
10256,18.2,2012-01-01
10257,18.4,2012-01-03
10258,18.9,2012-01-04
10259,18.8,2012-01-05
10260,19.5,2012-01-06
...,...,...
1781,25.9,2019-01-16
1782,26.8,2019-01-17
1783,27.8,2019-01-18
1784,29.3,2019-01-19


In [49]:
new_model = build_model(new_train, '平均價')
future = new_model.make_future_dataframe(periods=7)
forecast = new_model.predict(future)

In [52]:
# forecast[(forecast['ds'] >= '2019-01-01') & (forecast['ds'] < '2019-12-31')]

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
2111,2019-01-15,26.577404,28.763305,43.854709,26.577404,26.577404,9.795154,9.795154,9.795154,0.034953,0.034953,0.034953,9.760201,9.760201,9.760201,0.0,0.0,0.0,36.372557
2112,2019-01-16,26.587449,29.061316,43.307583,26.587449,26.587449,9.741543,9.741543,9.741543,0.100984,0.100984,0.100984,9.640559,9.640559,9.640559,0.0,0.0,0.0,36.328992
2113,2019-01-17,26.597494,28.697393,43.829017,26.597494,26.597494,9.517478,9.517478,9.517478,0.009702,0.009702,0.009702,9.507775,9.507775,9.507775,0.0,0.0,0.0,36.114972
2114,2019-01-18,26.607539,27.710984,42.337767,26.607539,26.607539,8.805994,8.805994,8.805994,-0.558801,-0.558801,-0.558801,9.364795,9.364795,9.364795,0.0,0.0,0.0,35.413533
2115,2019-01-19,26.617584,28.450744,42.656594,26.617584,26.617584,8.994759,8.994759,8.994759,-0.219927,-0.219927,-0.219927,9.214686,9.214686,9.214686,0.0,0.0,0.0,35.612344
2116,2019-01-20,26.62763,28.986299,43.375183,26.62763,26.62763,9.396486,9.396486,9.396486,0.335915,0.335915,0.335915,9.060571,9.060571,9.060571,0.0,0.0,0.0,36.024116
2117,2019-01-21,26.637675,28.213618,42.833673,26.637675,26.637675,9.202725,9.202725,9.202725,0.297173,0.297173,0.297173,8.905552,8.905552,8.905552,0.0,0.0,0.0,35.8404
2118,2019-01-22,26.64772,28.864371,42.420382,26.64772,26.64772,8.787592,8.787592,8.787592,0.034953,0.034953,0.034953,8.752639,8.752639,8.752639,0.0,0.0,0.0,35.435312
2119,2019-01-23,26.657765,28.220944,42.931219,26.657765,26.657765,8.705668,8.705668,8.705668,0.100984,0.100984,0.100984,8.604683,8.604683,8.604683,0.0,0.0,0.0,35.363433
2120,2019-01-24,26.667811,27.789077,42.297501,26.667811,26.667811,8.474016,8.474016,8.474016,0.009702,0.009702,0.009702,8.464314,8.464314,8.464314,0.0,0.0,0.0,35.141827


# Create the 7-day sequence for 2019

In [55]:
INPUTS = []
start_date = pd.to_datetime('2019-01-01')
end_date = start_date

while end_date < pd.to_datetime('2020-01-01'):
  end_date = pd.to_datetime(start_date) + pd.DateOffset(days=7)
  start_date = start_date + pd.DateOffset(days=1)
  INPUTS.append((start_date, end_date))

In [56]:
INPUTS

[(Timestamp('2019-01-02 00:00:00'), Timestamp('2019-01-08 00:00:00')),
 (Timestamp('2019-01-03 00:00:00'), Timestamp('2019-01-09 00:00:00')),
 (Timestamp('2019-01-04 00:00:00'), Timestamp('2019-01-10 00:00:00')),
 (Timestamp('2019-01-05 00:00:00'), Timestamp('2019-01-11 00:00:00')),
 (Timestamp('2019-01-06 00:00:00'), Timestamp('2019-01-12 00:00:00')),
 (Timestamp('2019-01-07 00:00:00'), Timestamp('2019-01-13 00:00:00')),
 (Timestamp('2019-01-08 00:00:00'), Timestamp('2019-01-14 00:00:00')),
 (Timestamp('2019-01-09 00:00:00'), Timestamp('2019-01-15 00:00:00')),
 (Timestamp('2019-01-10 00:00:00'), Timestamp('2019-01-16 00:00:00')),
 (Timestamp('2019-01-11 00:00:00'), Timestamp('2019-01-17 00:00:00')),
 (Timestamp('2019-01-12 00:00:00'), Timestamp('2019-01-18 00:00:00')),
 (Timestamp('2019-01-13 00:00:00'), Timestamp('2019-01-19 00:00:00')),
 (Timestamp('2019-01-14 00:00:00'), Timestamp('2019-01-20 00:00:00')),
 (Timestamp('2019-01-15 00:00:00'), Timestamp('2019-01-21 00:00:00')),
 (Time

# Predict the whole year (output = week)

In [76]:
START = 0
END = 1

prediction = []
pbar = tqdm(total=len(DATE_POINTS))

for d in DATE_POINTS:
  input_data = test_data[(pd.to_datetime(test_data['DateTime']) >= d[START]) & \
                         (pd.to_datetime(test_data['DateTime']) < d[END])]
  # add to the training data
  input_price = input_data[['平均價', 'DateTime']]
  new_train = pd.concat([df_price, input_price])
  model = build_model(new_train, '平均價')
  future = model.make_future_dataframe(periods=7)
  forecast = model.predict(future)
  
  # extract the predict result
  forecast = forecast[(forecast['ds'] >= d[END])]
  prediction.append(forecast)
  
  pbar.update(1)
pbar.close()

total_result = pd.concat(prediction)
total_result.to_csv('result/banana_week_pred.csv', index = False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=53.0), HTML(value='')))




In [68]:
total_result = pd.concat(prediction)
total_result

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
2118,2019-01-08,26.55394,28.81027,44.050627,26.55394,26.55394,9.957155,9.957155,9.957155,0.03306,0.03306,0.03306,9.924094,9.924094,9.924094,0.0,0.0,0.0,36.511094
2119,2019-01-09,26.563892,29.286009,43.329236,26.563892,26.563892,10.105837,10.105837,10.105837,0.09232,0.09232,0.09232,10.013517,10.013517,10.013517,0.0,0.0,0.0,36.669729
2120,2019-01-10,26.573845,29.575453,43.754046,26.573845,26.573845,10.077558,10.077558,10.077558,-0.001134,-0.001134,-0.001134,10.078692,10.078692,10.078692,0.0,0.0,0.0,36.651403
2121,2019-01-11,26.583797,28.895516,43.462907,26.583797,26.583797,9.544963,9.544963,9.544963,-0.574037,-0.574037,-0.574037,10.119,10.119,10.119,0.0,0.0,0.0,36.128761
2122,2019-01-12,26.59375,29.352972,43.789565,26.59375,26.59375,9.902239,9.902239,9.902239,-0.232056,-0.232056,-0.232056,10.134295,10.134295,10.134295,0.0,0.0,0.0,36.495989
2123,2019-01-13,26.603702,29.744341,44.204376,26.603702,26.603702,10.448987,10.448987,10.448987,0.324082,0.324082,0.324082,10.124905,10.124905,10.124905,0.0,0.0,0.0,37.052689
2118,2019-01-15,26.351486,28.886536,43.152893,26.351486,26.351486,9.991346,9.991346,9.991346,0.055413,0.055413,0.055413,9.935933,9.935933,9.935933,0.0,0.0,0.0,36.342832
2119,2019-01-16,26.360417,29.210233,43.468312,26.360417,26.360417,9.949047,9.949047,9.949047,0.104051,0.104051,0.104051,9.844997,9.844997,9.844997,0.0,0.0,0.0,36.309464
2120,2019-01-17,26.369347,29.130983,43.892794,26.369347,26.369347,9.750975,9.750975,9.750975,0.011334,0.011334,0.011334,9.739641,9.739641,9.739641,0.0,0.0,0.0,36.120322
2121,2019-01-18,26.378277,27.409152,43.306836,26.378277,26.378277,9.062797,9.062797,9.062797,-0.559526,-0.559526,-0.559526,9.622323,9.622323,9.622323,0.0,0.0,0.0,35.441074


In [75]:
for r in test_df[(test_df['DateTime'] <= '2019-01-31')]['DateTime']:
  print(r)

2019-01-01
2019-01-02
2019-01-03
2019-01-04
2019-01-05
2019-01-06
2019-01-08
2019-01-09
2019-01-10
2019-01-11
2019-01-12
2019-01-13
2019-01-15
2019-01-16
2019-01-17
2019-01-18
2019-01-19
2019-01-20
2019-01-22
2019-01-23
2019-01-24
2019-01-25
2019-01-26
2019-01-27
2019-01-29
2019-01-30
2019-01-31
2019-01-01
2019-01-02
2019-01-03
2019-01-04
2019-01-05
2019-01-06
2019-01-08
2019-01-09
2019-01-10
2019-01-11
2019-01-12
2019-01-13
2019-01-15
2019-01-16
2019-01-17
2019-01-18
2019-01-19
2019-01-20
2019-01-22
2019-01-23
2019-01-24
2019-01-25
2019-01-26
2019-01-27
2019-01-29
2019-01-30
2019-01-31
2019-01-01
2019-01-02
2019-01-03
2019-01-04
2019-01-05
2019-01-06
2019-01-08
2019-01-09
2019-01-10
2019-01-11
2019-01-12
2019-01-13
2019-01-15
2019-01-16
2019-01-17
2019-01-18
2019-01-19
2019-01-20
2019-01-22
2019-01-23
2019-01-24
2019-01-25
2019-01-26
2019-01-27
2019-01-29
2019-01-30
2019-01-31
2019-01-01
2019-01-02
2019-01-03
2019-01-04
2019-01-05
2019-01-06
2019-01-08
2019-01-09
2019-01-10
2019-01-11