In [1]:
import contextlib
from datetime import datetime
from fastai import *
from fastai.tabular import *
import io
import pandas as pd
from pathlib import Path
import random

In [2]:
# Directory (relative path) holding the CSV files this notebook reads and writes back.
DATA_PATH = Path('../../data/interim')
# Relative path to the models directory; not referenced by any other cell visible here.
path = Path('../../models/')

In [6]:
# Read the test set once and keep the untouched frame as `test_raw`;
# all further processing happens on the independent copy `test`.
test_raw = pd.read_csv(DATA_PATH.joinpath('test_data.csv'), low_memory=False)
test = test_raw.copy(deep=True)

In [7]:
# Rebuild `test` from `test_raw` on every run instead of mutating it in place:
# with the in-place drops, executing this cell a second time raised KeyError
# because the columns were already gone.
# week_start and day_of_week are dropped as they'll get taken into account by
# add_datepart() for date.
test = test_raw.drop(['week_start', 'day_of_week'], axis='columns')

# Drop any rows where sales = 0 since we don't need to forecast those.
# .copy() hands add_datepart() an independent frame to mutate rather than a
# boolean-filtered selection of another frame (avoids SettingWithCopyWarning).
test = test[test.sales != 0].copy()

# Run tabular.add_datepart() on `test` to add columns about the date: ['Year', 'Month', 'Week', 'Day', 'Dayofweek',
# 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
# (the preview output further down also shows an 'Elapsed' column). drop=True removes the source `date` column.
tabular.add_datepart(test, 'date', drop=True, time=False)

In [13]:
# Preview the first rows of `test`.
# Scratch checks previously tried here (left disabled): comparing `test` with
# itself sorted by ['Elapsed', 'store'], test.info(), and the head of the sorted frame.
test.head()

Unnamed: 0,store,state,max_temperature_c,mean_temperature_c,min_temperature_c,dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,mean_humidity,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1,HE,,,,,,,,,...,20,5,171,False,False,False,False,False,False,1434758400
1,56,HE,,,,,,,,,...,20,5,171,False,False,False,False,False,False,1434758400
2,69,HE,,,,,,,,,...,20,5,171,False,False,False,False,False,False,1434758400
3,77,HE,,,,,,,,,...,20,5,171,False,False,False,False,False,False,1434758400
4,111,HE,,,,,,,,,...,20,5,171,False,False,False,False,False,False,1434758400


In [15]:
# Preview the first rows of `test`.
# NOTE(review): the recorded output for this cell shows a single row that still
# has the raw `date`, `day_of_week` and `week_start` columns, so `test` here is
# not the frame the preprocessing cell builds. Presumably it was rebound from
# example_data_row.csv (the disabled read below) or by a cell that no longer
# exists -- confirm before relying on Restart & Run All.
#test = pd.read_csv(DATA_PATH/'example_data_row.csv', low_memory=False)
test.head()

Unnamed: 0,store,state,date,max_temperature_c,mean_temperature_c,min_temperature_c,dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,...,promo_interval,day_of_week,sales,customers,open,promo,state_holiday,school_holiday,trend,week_start
0,1,HE,2015-06-20,17,14,11,9,7,5,88,...,,5,4097.0,494.0,1.0,0.0,0,0.0,85,2015-06-14


In [16]:
# Column names that the following cells overwrite with NaN.
# One entry per line, in the original order.
unknown_in_future = [
    'cloud_cover',
    'customers',
    'dew_point_c',
    'events',
    'max_gust_speed_km_h',
    'max_humidity',
    'max_sea_level_pressureh_pa',
    'max_temperature_c',
    'max_visibility_km',
    'max_wind_speed_km_h',
    'mean_dew_point_c',
    'mean_humidity',
    'mean_sea_level_pressureh_pa',
    'mean_temperature_c',
    'mean_visibility_km',
    'mean_wind_speed_km_h',
    'min_dew_point_c',
    'min_humidity',
    'min_sea_level_pressureh_pa',
    'min_temperature_c',
    'min_visibility_km',
    'precipitationmm',
    'trend',
    'wind_dir_degrees',
]

In [17]:
# Overwrite every column listed in unknown_in_future with NaN, then preview.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
test[unknown_in_future] = np.nan
test.head()

Unnamed: 0,store,state,date,max_temperature_c,mean_temperature_c,min_temperature_c,dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,...,promo_interval,day_of_week,sales,customers,open,promo,state_holiday,school_holiday,trend,week_start
0,1,HE,2015-06-20,,,,,,,,...,,5,4097.0,,1.0,0.0,0,0.0,,2015-06-14


In [18]:
# Persist `test` as example_data_row.csv without writing the index column.
test.to_csv(DATA_PATH.joinpath('example_data_row.csv'), index=False)

In [27]:
# Load fake_all_sales_double.csv and preview its first five rows.
f = pd.read_csv(DATA_PATH.joinpath('fake_all_sales_double.csv'), low_memory=False)
f.head(n=5)

Unnamed: 0,store,state,date,max_temperature_c,mean_temperature_c,min_temperature_c,dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,...,promo_interval,day_of_week,sales,customers,open,promo,state_holiday,school_holiday,trend,week_start
0,1,HE,2015-06-20,17,14,11,9,7,5,88,...,,5,8236.636719,494.0,1.0,0.0,0,0.0,85,2015-06-14
1,56,HE,2015-06-20,17,14,11,9,7,5,88,...,"Mar,Jun,Sept,Dec",5,18314.673828,667.0,1.0,0.0,0,0.0,85,2015-06-14
2,69,HE,2015-06-20,17,14,11,9,7,5,88,...,"Jan,Apr,Jul,Oct",5,13903.62793,941.0,1.0,0.0,0,0.0,85,2015-06-14
3,77,HE,2015-06-20,17,14,11,9,7,5,88,...,"Jan,Apr,Jul,Oct",5,15056.893555,687.0,1.0,0.0,0,0.0,85,2015-06-14
4,111,HE,2015-06-20,17,14,11,9,7,5,88,...,"Jan,Apr,Jul,Oct",5,11922.927734,600.0,1.0,0.0,0,0.0,85,2015-06-14


In [28]:
# Overwrite every column listed in unknown_in_future with NaN, then preview.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there.
f[unknown_in_future] = np.nan
f.head()

Unnamed: 0,store,state,date,max_temperature_c,mean_temperature_c,min_temperature_c,dew_point_c,mean_dew_point_c,min_dew_point_c,max_humidity,...,promo_interval,day_of_week,sales,customers,open,promo,state_holiday,school_holiday,trend,week_start
0,1,HE,2015-06-20,,,,,,,,...,,5,8236.636719,,1.0,0.0,0,0.0,,2015-06-14
1,56,HE,2015-06-20,,,,,,,,...,"Mar,Jun,Sept,Dec",5,18314.673828,,1.0,0.0,0,0.0,,2015-06-14
2,69,HE,2015-06-20,,,,,,,,...,"Jan,Apr,Jul,Oct",5,13903.62793,,1.0,0.0,0,0.0,,2015-06-14
3,77,HE,2015-06-20,,,,,,,,...,"Jan,Apr,Jul,Oct",5,15056.893555,,1.0,0.0,0,0.0,,2015-06-14
4,111,HE,2015-06-20,,,,,,,,...,"Jan,Apr,Jul,Oct",5,11922.927734,,1.0,0.0,0,0.0,,2015-06-14


In [29]:
# Write `f` back over fake_all_sales_double.csv (the file it was loaded from),
# omitting the index column.
f.to_csv(DATA_PATH.joinpath('fake_all_sales_double.csv'), index=False)

In [36]:
# Scratch check of how checkpoint-style names order under a reverse
# lexicographic sort: the timestamp-suffixed names come first (newest
# timestamp on top) and the bare 'current_best' comes last.
sorted(
    [
        'current_best',
        'current_best-2019-03-01-00:00:00',
        'current_best-2019-03-08-16:17:05',
        'current_best-2019-03-11-16:17:05',
    ],
    reverse=True,
)

['current_best-2019-03-11-16:17:05',
 'current_best-2019-03-08-16:17:05',
 'current_best-2019-03-01-00:00:00',
 'current_best']