# IMPORTS

In [1]:
import numpy as np
import pandas as pd

from datetime import datetime, date
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

# PLOTTING PARAMETERS

In [2]:
# Centimetre-to-inch factor: multiply a length in cm by `cm` to get the value
# in inches (used for the `figsize` arguments, e.g. 18*cm).
cm = 1 / 2.54

# Matplotlib overrides: LaTeX text rendering, serif fonts and explicit font
# sizes for titles, labels, legends and ticks.
tex_fonts = {
    "text.usetex": True,
    "font.family": "serif",
    "axes.titlesize": 10,
    "axes.labelsize": 10,
    "font.size": 10,
    "legend.fontsize": 'xx-small',
    "legend.title_fontsize": 10,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "ps.usedistiller": "xpdf",
}

# Apply the font settings and the LaTeX preamble in a single update.
plt.rcParams.update({
    **tex_fonts,
    'text.latex.preamble': r'\usepackage{amsmath,amssymb,underscore}',
})

# PATHS

In [3]:
# Directory holding the input files read by later cells
# (clean_data.gz, stores.csv, test.csv, hol_test.csv), relative to the working directory.
path_data = Path('./input')

In [4]:
# Directory for saved figures, relative to the working directory.
# NOTE(review): presumably passed as `path` to plot_ts_fit -- no visible cell uses it; confirm.
path_plots = Path('./plots')

In [5]:
def plot_ts_fit(train, fit, xlab, ylab, title, path, filename, cm):
    """Plot training sales and a fitted curve against day of year and save the figure as a PDF.

    Parameters
    ----------
    train : object with ``dayofyear`` and ``sales`` attributes
        Training data (e.g. a DataFrame with those two columns); drawn in blue.
    fit : array-like
        Fitted values, plotted against ``train.dayofyear``; drawn in red.
    xlab, ylab, title : str
        Axis labels and figure title.
    path : str or pathlib.Path
        Output directory. It is created (including parents) if it does not exist.
    filename : str
        Name of the output file inside ``path``; converted with ``str``.
    cm : float
        Centimetre-to-inch conversion factor; the figure is 18 cm x 12 cm.

    Returns
    -------
    None
        The figure is left open so that the notebook can still display it.
    """
    fig = plt.figure(figsize=(18*cm, 12*cm))
    ax = fig.add_subplot(111)

    ax.plot(train.dayofyear, train.sales, c='blue', label='Training data')
    ax.plot(train.dayofyear, fit, c='red', label='Fit to the training data')

    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    ax.set_title(title)

    ax.legend(title=None, loc="upper right")

    fig.tight_layout()

    # savefig does not create missing directories, so make sure the target exists first
    out_dir = Path(path)
    out_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_dir / str(filename), format='pdf')

# DATA

## Load clean data

In [6]:
# IMPORT CLEAN DATA FROM THE data_cleaning notebook

input_filename = path_data / 'clean_data.gz'
# `infer_datetime_format` is deprecated in recent pandas and only triggers a
# warning, so it is no longer passed; `parse_dates` alone parses the column.
data = pd.read_csv(
    input_filename,
    compression=dict(method='gzip'),
    index_col='index',
    dtype={'dayname': 'category'},
    parse_dates=['date'],
)
# No-op when `parse_dates` succeeded; otherwise fails here instead of at the
# later `.dt` access.
data['date'] = pd.to_datetime(data['date'])

# csv I/O removes any categorical information (categories and ordering).
# Such info has to be re-initialized here, or the data stored in an hdf file.
listdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
liststore = list(range(1, 55))
# pd.Categorical() sets the order according to the lists above
data['dayname'] = pd.Categorical(data['dayname'], categories=listdays, ordered=True)
data['store_nbr'] = pd.Categorical(data['store_nbr'], categories=liststore, ordered=True)
data['dayofmonth'] = data['date'].dt.day

data.head()

  data = pd.read_csv(input_filename,


Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion,city,state,store_type,cluster,locale,...,dayname,dayofyear,week,year,Terremoto,Futbol,CyberMonday,BlackFriday,Madre,dayofmonth
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13,National,...,Tuesday,1,1,2013,False,False,False,False,False,1
1,2016-01-01,21,POULTRY,0.0,0,Santo Domingo,Santo Domingo de los Tsachilas,B,6,National,...,Friday,1,53,2016,False,False,False,False,False,1
2,2016-01-01,21,PREPARED FOODS,0.0,0,Santo Domingo,Santo Domingo de los Tsachilas,B,6,National,...,Friday,1,53,2016,False,False,False,False,False,1
3,2016-01-01,21,PRODUCE,0.0,0,Santo Domingo,Santo Domingo de los Tsachilas,B,6,National,...,Friday,1,53,2016,False,False,False,False,False,1
4,2016-01-01,21,SCHOOL AND OFFICE SUPPLIES,0.0,0,Santo Domingo,Santo Domingo de los Tsachilas,B,6,National,...,Friday,1,53,2016,False,False,False,False,False,1


## Prepare train data

In [7]:
# Training set: only the rows of year 2017, indexed by
# (store_nbr, family, date) and sorted on that index.
train = (
    data[data['year'] == 2017]
    .set_index(['store_nbr', 'family', 'date'])
    .sort_index()
)
# Distinct values of the 'Madre' flag within the training set
train['Madre'].unique()

array([False,  True])

In [8]:
# Terremoto always 'False' (it took place in 2016)
# Futbol always 'False' (it took place in 2014)
# BlackFriday always 'False' (it takes place after the end of the test data)
# CyberMonday always 'False' (it takes place after the end of the test data)


In [9]:
# Create features for dates in averages_sales
#
# NOTE(review): this cell raises NameError on a fresh kernel.
#   * `CalendarFourier` and `DeterministicProcess` are not imported anywhere in
#     the visible notebook (presumably they come from
#     `statsmodels.tsa.deterministic` -- confirm and add to the imports cell).
#   * `y` is not defined in any visible earlier cell; its index is used as the
#     index of the deterministic process below.

# Fourier terms of order 4 at calendar frequency 'M'
fourier = CalendarFourier(freq='M', order=4)

dp = DeterministicProcess(
    index=y.index,                # dates from the training data
    constant=True,               # dummy feature for the bias (y_intercept)
    order=1,                      # the time dummy (trend)
    seasonal=True,                # weekly seasonality (dealt with via indicators) 
    additional_terms = [fourier], # monthly seasonality (dealt with via Fourier coefficients)
    drop=True,                    # drop terms if necessary to avoid collinearity
)

# Feature matrix evaluated on the dates of `y.index`
X = dp.in_sample()

NameError: name 'CalendarFourier' is not defined

## Stores data

In [None]:
# Store metadata: every listed column is read as a categorical.
stores = (
    pd.read_csv(
        path_data / 'stores.csv',
        dtype=dict.fromkeys(
            ['store_nbr', 'city', 'state', 'type', 'cluster'],
            'category',
        ),
    )
    # the name `type` is already used as a column name, hence `store_type`
    .rename(columns={'type': 'store_type'})
)
stores.head()

## Test data

In [None]:
# Test set: only the columns needed downstream are read.
# `infer_datetime_format` is deprecated in recent pandas and is no longer passed.
test_in = pd.read_csv(
    path_data / 'test.csv',
    usecols=['store_nbr', 'family', 'date', 'onpromotion'],
    dtype={'store_nbr': 'category',
           'family': 'category',
           'onpromotion': 'uint32',
          },
    parse_dates=['date'],
    )

# Assign the result back: calling pd.to_datetime without storing it has no effect.
test_in['date'] = pd.to_datetime(test_in['date'])
test_in.head()

In [None]:
# Attach the store metadata to each test row; the left join keeps every test row.
test2 = pd.merge(test_in, stores, how='left', on='store_nbr')
test2.head()

In [None]:
# Import holidays in the test data
# The former `dtype` entry for 'dayname' is dropped because that column is not
# in `usecols`; the deprecated `infer_datetime_format` is no longer passed.
hol_test = pd.read_csv(
    path_data / 'hol_test.csv',
    usecols=['date', 'locale', 'locale_name', 'status',
             'Terremoto', 'Futbol', 'CyberMonday', 'BlackFriday', 'Madre'],
    parse_dates=['date'],
)
# No-op when `parse_dates` succeeded; otherwise fails here rather than at the
# later merge on 'date'.
hol_test['date'] = pd.to_datetime(hol_test['date'])
hol_test

In [None]:
# Merge the outcome with holidays
test3 = test2.merge(hol_test, on='date', how='left')

# At this stage we have a lot of NaNs coming from hol_test
# except the single day with holiday information in the test sample
# this is a local holiday at the city 'Ambato'

# fill in status with 'holiday' only when 'city' == 'locale_name' (i. e., Ambato)
# data in hol_test is simple, so next line is not necessary
# test3.loc[~test3.status.isna() & (test3.city==test3.locale_name), 'status'] = 'holiday'

# else fill in with 'work' (rows without a matching holiday have a NaN
# locale_name, which never equals the city, so they become 'work' too)
test3.loc[test3['city'] != test3['locale_name'], 'status'] = 'work'

# fill in NaNs: an event flag missing after the left merge means "no event"
event_flags = ['Terremoto', 'Futbol', 'CyberMonday', 'BlackFriday', 'Madre']
for flag in event_flags:
    test3.loc[test3[flag].isna(), flag] = False

# add boolean column 'Workday': True exactly where status is 'work'.
# (The previous version compared against 'Work' -- a value never written to the
# column -- and swapped the True/False arms, so every row was flagged True.)
test3['Workday'] = test3['status'] == 'work'

# add boolean column 'Promotion': True when at least one item is on promotion.
# (The previous version had the True/False arms swapped.)
test3['Promotion'] = test3['onpromotion'] > 0

# drop columns (locale, locale name)
test3 = test3.drop(columns=['locale', 'locale_name'])
# sanity check: cities that have a row flagged as holiday
test3.loc[test3['status'] == 'holiday', 'city'].unique()

In [None]:
# Preview the first rows of test3
test3.head()