In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from pylab import rcParams
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf

In [7]:
train = pd.read_csv('train.csv',index_col='id')
hol_events = pd.read_csv('holidays_events.csv')
oil = pd.read_csv('oil.csv')
stores = pd.read_csv('stores.csv',index_col='store_nbr')
transactions = pd.read_csv('transactions.csv')

In [8]:
train['date'] = pd.to_datetime(train['date'])
train['date'] = train['date'].dt.to_period('D')
hol_events['date'] = pd.to_datetime(hol_events['date'])
hol_events['date'] = hol_events['date'].dt.to_period('D')
oil['date'] = pd.to_datetime(oil['date'])
oil['date'] = oil['date'].dt.to_period('D')
transactions['date'] = pd.to_datetime(transactions['date'])
transactions['date'] = transactions['date'].dt.to_period('D')
train.sort_values(by=['date','store_nbr','family'],ascending=True,ignore_index=True,inplace=True)

In [9]:
label_encoder = LabelEncoder()

In [10]:
train['onpromotion'] = train['onpromotion'].astype(np.uint16)
train['family'] = label_encoder.fit_transform(train['family'])
family_dict = {num:name for num,name in zip(range(len(label_encoder.classes_)),label_encoder.classes_)}
train

Unnamed: 0,date,store_nbr,family,sales,onpromotion
0,2013-01-01,1,0,0.000,0
1,2013-01-01,1,1,0.000,0
2,2013-01-01,1,2,0.000,0
3,2013-01-01,1,3,0.000,0
4,2013-01-01,1,4,0.000,0
...,...,...,...,...,...
3000883,2017-08-15,54,28,59.619,0
3000884,2017-08-15,54,29,94.000,0
3000885,2017-08-15,54,30,915.371,76
3000886,2017-08-15,54,31,0.000,0


In [11]:
def make_label_dict(labels):
    return {num:name for num,name in zip(range(len(labels)),labels)}

In [12]:
hol_events['type'] = label_encoder.fit_transform(hol_events['type'])
type_dict = make_label_dict(label_encoder.classes_)
hol_events['locale'] = label_encoder.fit_transform(hol_events['locale'])
locale_dict = make_label_dict(label_encoder.classes_)
hol_events['locale_name'] = label_encoder.fit_transform(hol_events['locale_name'])
locale_name_dict = make_label_dict(label_encoder.classes_)
hol_events['description'] = label_encoder.fit_transform(hol_events['description'])
description_dict = make_label_dict(label_encoder.classes_)

In [13]:
#np.random.seed(3)
oil['interp'] = oil['dcoilwtico'].interpolate(method='linear')
#oil.loc[0,'interp'] = oil.loc[1,'interp'] + np.random.randn(1)

In [14]:
stores['city'] = label_encoder.fit_transform(stores['city'])
city_dict = make_label_dict(label_encoder.classes_)
stores['state'] = label_encoder.fit_transform(stores['state'])
state_dict = make_label_dict(label_encoder.classes_)
stores['type'] = label_encoder.fit_transform(stores['type'])
stores_type_dict = make_label_dict(label_encoder.classes_)

In [15]:
train = pd.merge(left=train, right=oil[['date','interp']], how='left', on='date')
train = pd.merge(left=train, right=stores, how='left', on='store_nbr')
#train = pd.merge(left=train, right=hol_events, how='left', on='store_nbr')
train = pd.merge(left=train, right=transactions, how='left', on=['date','store_nbr'])

In [16]:
train['day'] = train['date'].dt.day.astype(np.uint8)
train['weekday'] = train['date'].dt.weekday.astype(np.uint8)
train['week'] = train['date'].dt.week.astype(np.uint8)
train['month'] = train['date'].dt.month.astype(np.uint8)
train['year'] = train['date'].dt.year.astype(np.uint16)

In [17]:
oil['day'] = oil['date'].dt.day.astype(np.uint8)
oil['weekday'] = oil['date'].dt.weekday.astype(np.uint8)
oil['week'] = oil['date'].dt.week.astype(np.uint8)
oil['month'] = oil['date'].dt.month.astype(np.uint8)
oil['year'] = oil['date'].dt.year.astype(np.uint16)

In [18]:
train.loc[train['transactions'].isna(),'transactions'] = 0
train

Unnamed: 0,date,store_nbr,family,sales,onpromotion,interp,city,state,type,cluster,transactions,day,weekday,week,month,year
0,2013-01-01,1,0,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013
1,2013-01-01,1,1,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013
2,2013-01-01,1,2,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013
3,2013-01-01,1,3,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013
4,2013-01-01,1,4,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,54,28,59.619,0,47.57,5,10,2,3,802.0,15,1,33,8,2017
3000884,2017-08-15,54,29,94.000,0,47.57,5,10,2,3,802.0,15,1,33,8,2017
3000885,2017-08-15,54,30,915.371,76,47.57,5,10,2,3,802.0,15,1,33,8,2017
3000886,2017-08-15,54,31,0.000,0,47.57,5,10,2,3,802.0,15,1,33,8,2017


In [19]:
train[['sales']]

Unnamed: 0,sales
0,0.000
1,0.000
2,0.000
3,0.000
4,0.000
...,...
3000883,59.619
3000884,94.000
3000885,915.371
3000886,0.000


In [20]:
hol_events

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,3,0,15,25,False
1,2012-04-01,3,2,2,54,False
2,2012-04-12,3,0,3,18,False
3,2012-04-14,3,0,12,6,False
4,2012-04-21,3,0,19,8,False
...,...,...,...,...,...,...
345,2017-12-22,0,1,4,48,False
346,2017-12-23,0,1,4,47,False
347,2017-12-24,0,1,4,46,False
348,2017-12-25,3,1,4,44,False


train.index = train['date']
train

In [21]:
train['weekofyear'] = train['date'].dt.weekofyear
train.index = train['weekofyear']
train

Unnamed: 0_level_0,date,store_nbr,family,sales,onpromotion,interp,city,state,type,cluster,transactions,day,weekday,week,month,year,weekofyear
weekofyear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2013-01-01,1,0,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013,1
1,2013-01-01,1,1,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013,1
1,2013-01-01,1,2,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013,1
1,2013-01-01,1,3,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013,1
1,2013-01-01,1,4,0.000,0,,18,12,3,13,0.0,1,1,1,1,2013,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,2017-08-15,54,28,59.619,0,47.57,5,10,2,3,802.0,15,1,33,8,2017,33
33,2017-08-15,54,29,94.000,0,47.57,5,10,2,3,802.0,15,1,33,8,2017,33
33,2017-08-15,54,30,915.371,76,47.57,5,10,2,3,802.0,15,1,33,8,2017,33
33,2017-08-15,54,31,0.000,0,47.57,5,10,2,3,802.0,15,1,33,8,2017,33


In [22]:
for year in range(2013,2018):
    print(max(train.loc[train['year'] == year,'weekofyear']))

52
52
53
53
52


In [23]:
temp = [0,52,52,53,53]
for year in range(2013,2018):
    train.loc[train['year']==year,'weekofyear'] += sum(temp[:(year%2013)+1])

In [24]:
train['weekofyear']

weekofyear
1        1
1        1
1        1
1        1
1        1
      ... 
243    243
243    243
243    243
243    243
243    243
Name: weekofyear, Length: 3000888, dtype: int64

In [25]:
train.index = train['weekofyear']

In [26]:
train.set_index(['weekofyear','store_nbr','family'],inplace=True)

In [None]:
def cv_split(data, train_init_size, test_size, n_splits, gap=0):
    train_begin = 1
    train_end = train_begin + train_init_size
    test_begin = train_end + gap
    test_end = test_begin + test_size
    step = (243 - (train_init_size + test_size)) // n_splits
    step = step if step else 1
    while test_end <= 243:
        yield data.loc[range(train_begin,train_end)], data.loc[range(test_begin,test_end)]
        train_end += step
        test_begin += step
        test_end += step

In [None]:
train