In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes (useful while editing the local utils package).
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import numpy as np
import pandas as pd

In [3]:
from utils import preprocessing_utils, general_utils

0it [00:00, ?it/s]

Load datasets: train, test, store

In [4]:
# Relative directory prefix used for every file this notebook reads or writes.
# It is concatenated directly with file names, so the trailing slash is required.
DATA_PATH = 'data/'

In [7]:
# Read the three raw CSV inputs located under DATA_PATH.
# low_memory=False makes pandas read train/test in a single pass instead of chunks.
train_df = pd.read_csv(f'{DATA_PATH}train.csv', low_memory=False)
test_df = pd.read_csv(f'{DATA_PATH}test.csv', low_memory=False)
store_df = pd.read_csv(f'{DATA_PATH}store.csv')

In [8]:
# Convert the Date column of both frames to datetime64.
# `infer_datetime_format=True` is intentionally not passed: the argument is
# deprecated since pandas 2.0 (strict format inference is the default there)
# and only emits a warning; older pandas versions parse the column without it.
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])

In [9]:
# Show the first and last date covered by the train set, then by the test set.
for frame in (train_df, test_df):
    date_col = frame['Date']
    print(date_col.min(), date_col.max())

2013-01-01 00:00:00 2015-07-31 00:00:00
2015-08-01 00:00:00 2015-09-17 00:00:00


We have data for 2013, 2014 and 2015 up to 2015-07-31. \
We need to predict sales for the next ~1.5 months, i.e. through 2015-09-17 (47 days after 2015-08-01; 48 calendar days counting both endpoints).

## Data Preparation

Fill missing values

In [10]:
# Fill missing `Open` flags in the test set:
# 1. carry the last known value forward within each store;
# 2. set whatever is still missing (no earlier value for that store) to 0.
# The second step assigns the result back instead of calling
# `test_df['Open'].fillna(0, inplace=True)`: in-place fillna on a selected
# column is chained assignment, which warns in pandas 2.x and does not modify
# `test_df` at all once Copy-on-Write is enabled.
test_df['Open'] = test_df.groupby('Store')['Open'].ffill()
test_df['Open'] = test_df['Open'].fillna(0)

In [11]:
# Sanity check: number of missing values left in each test-set column.
test_df.isna().sum()

Id               0
Store            0
DayOfWeek        0
Date             0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Remove records for the dates when stores were closed. \
WHY? \
To prevent bias. \
HOW? \
1. Select the indices of records for the dates when stores were closed.
2. Save the information about promos and holidays on those dates.
3. Fill the gaps in the date range.
4. Fill the gaps in sales and customers values by interpolation.
5. Fill missing values in promos and holidays from the previously saved data where possible.
6. Fill the remaining missing values with the nearest values.

In [12]:
# Apply the closed-day removal / gap-filling procedure described above to the
# training frame, using the 'nearest' fill method. The helper logs the frame
# shape after each stage. Note that `train_df` is overwritten, so re-running
# this cell operates on the already-processed frame.
train_df = preprocessing_utils.interpolate_sales_customers(train_df, method='nearest')

2021-02-23 15:34:38,362 | INFO : Shape before removal: (1017209, 9)
2021-02-23 15:34:38,739 | INFO : Shape after removal: (844392, 9)
2021-02-23 15:34:41,074 | INFO : Shape after filling date gaps: (1048712, 9)
2021-02-23 15:34:43,893 | INFO : Number of missing values in df: 0


In [13]:
# Preview the first rows of the gap-filled training frame.
# NOTE(review): in the recorded output the 2013-01-06 row (Open == 0.0) repeats
# DayOfWeek 6.0 together with the Sales/Customers of 2013-01-05, which looks
# like a side effect of the 'nearest' fill. Confirm that a copied DayOfWeek is
# acceptable or that it is recomputed from Date later.
train_df.head()

Unnamed: 0,Date,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,2013-01-02,1,3.0,5530.0,668.0,1.0,0.0,0,1.0
1,2013-01-03,1,4.0,4327.0,578.0,1.0,0.0,0,1.0
2,2013-01-04,1,5.0,4486.0,619.0,1.0,0.0,0,1.0
3,2013-01-05,1,6.0,4997.0,635.0,1.0,0.0,0,1.0
4,2013-01-06,1,6.0,4997.0,635.0,0.0,0.0,0,1.0


### Preprocess store data

In [14]:
# Replace missing values in the store table. Per the helper's log output:
# competition and Promo2 "since" columns get 0, PromoInterval gets "".
store_df = preprocessing_utils.fill_nans_store_df(store_df)

2021-02-23 15:34:43,970 | INFO : Fill NaN in CompetitionDistance, CompetitionOpenSinceMonth, CompetitionOpenSinceYear with 0
2021-02-23 15:34:43,975 | INFO : Fill NaN in Promo2SinceWeek, Promo2SinceYear with 0
2021-02-23 15:34:43,978 | INFO : Fill NaN in PromoInterval with ""


In [15]:
# Encode the categorical store columns and transform numeric/date features.
# Returns the transformed frame plus the label-encoder mapping. The log shows
# the mapping is also saved to '<DATA_PATH>store_cat_cols_le.pkl'.
store_df, store_label_encoder = preprocessing_utils.transform_store_df(store_df, DATA_PATH + 'store_cat_cols_le')

2021-02-23 15:34:44,013 | INFO : Encode labels in ['StoreType', 'Assortment', 'PromoInterval']
2021-02-23 15:34:44,023 | INFO : Transform log1p CompetitionDistance
2021-02-23 15:34:44,025 | INFO : Transform date features
2021-02-23 15:34:44,038 | INFO : Save label encoder to data/store_cat_cols_le.pkl


In [16]:
# Display the category -> integer code mapping produced for each encoded store column.
store_label_encoder

{'StoreType': {'a': 0, 'b': 1, 'c': 2, 'd': 3},
 'Assortment': {'a': 0, 'b': 1, 'c': 2},
 'PromoInterval': {'': 0,
  'Feb,May,Aug,Nov': 1,
  'Jan,Apr,Jul,Oct': 2,
  'Mar,Jun,Sept,Dec': 3}}

### Preprocess train/test

In [17]:
# The test set has no targets yet: add the two target columns as NaN so it has
# the same columns as the training frame, then stack train on top of test.
for target_col in ('Sales', 'Customers'):
    test_df[target_col] = np.nan
frames_to_stack = [train_df, test_df]
sales_df = pd.concat(frames_to_stack, ignore_index=True)

In [18]:
# List the distinct StateHoliday codes present in the training data before they are label-encoded.
train_df['StateHoliday'].unique()

array(['0', 'b', 'a', 'c'], dtype=object)

In [19]:
# Feature-engineer the combined train+test frame. Per the helper's log it adds
# Day/Month/Year, transforms the date features and label-encodes StateHoliday.
# The encoder mapping is returned and saved to '<DATA_PATH>sales_cat_cols_le.pkl'.
sales_df, sales_label_encoder = preprocessing_utils.transform_sales_df(sales_df, DATA_PATH + 'sales_cat_cols_le')

2021-02-23 15:34:47,463 | INFO : Add Day, Month, Year features
2021-02-23 15:34:47,586 | INFO : Transform date features
2021-02-23 15:34:48,365 | INFO : Encode StateHoliday
2021-02-23 15:34:48,365 | INFO : Encode labels in ['StateHoliday']
2021-02-23 15:34:48,506 | INFO : Save label encoder to data/sales_cat_cols_le.pkl


In [20]:
# Attach the per-store features to every sales row. A left join keeps every
# row of sales_df, matched on the Store id.
data = pd.merge(sales_df, store_df, how='left', on='Store')

In [21]:
# Remove the raw year columns and the test-set Id column from the merged frame.
# The result is rebound instead of using `inplace=True`, which avoids mutating
# a frame that other cells may have displayed and keeps the step chainable.
data = data.drop(columns=['Year', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'Id'])

In [22]:
# (rows, columns) of the merged feature frame.
data.shape

(1089800, 23)

In [23]:
# Missing values per column. Only Sales and Customers should be non-zero,
# since they were set to NaN for the appended test rows.
data.isna().sum()

Date                                 0
Store                                0
Sales                            41088
Customers                        41088
Open                                 0
Promo                                0
StateHoliday                         0
SchoolHoliday                        0
DayOfWeek_sin                        0
DayOfWeek_cos                        0
Month_sin                            0
Month_cos                            0
Day_sin                              0
Day_cos                              0
StoreType                            0
Assortment                           0
CompetitionDistance                  0
Promo2                               0
PromoInterval                        0
CompetitionOpenSinceMonth_sin        0
CompetitionOpenSinceMonth_cos        0
Promo2SinceWeek_sin                  0
Promo2SinceWeek_cos                  0
dtype: int64

In [25]:
# Persist the merged, not-yet-scaled frame so later stages can start from it.
unscaled_path = f'{DATA_PATH}unscaled_sales_data.pickle'
data.to_pickle(unscaled_path)

## Prepare data for model training

Validation strategy: side-by-side split. \
The dataset is split into independent parts: one part is used strictly for training and the other strictly for validation.

The data ranges from 2013 to 2015-07-31. \
The validation model is trained on 2013–2014 data and predicts 47 days of 2015 data. \
The best-performing model will then be trained on the data from 2013 to 2015-07-31 to predict the 47 days from 2015-08-01, without validation.

For the validation model, sequences from the last 47 days of 2014 should not be included in the training set, because their targets (`y`) contain 2015 values.

In [26]:
# Earliest date in the combined train+test frame.
data['Date'].min()

Timestamp('2013-01-01 00:00:00')

In [27]:
# Latest date in the combined train+test frame (end of the test period).
data['Date'].max()

Timestamp('2015-09-17 00:00:00')

In [29]:
# Daily calendar spanning the first to the last date present in `data` (both inclusive).
first_date = data['Date'].min()
last_date = data['Date'].max()
full_range = pd.date_range(start=first_date, end=last_date)

In [34]:
# Calendar days strictly before 2014-11-15 (the validation cut-off date used
# below); the displayed length shows how many days that is.
full_range[full_range<'2014-11-15']

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10',
               ...
               '2014-11-05', '2014-11-06', '2014-11-07', '2014-11-08',
               '2014-11-09', '2014-11-10', '2014-11-11', '2014-11-12',
               '2014-11-13', '2014-11-14'],
              dtype='datetime64[ns]', length=683, freq='D')

### Normalize Data

In [6]:
# Run mode: 'val' holds out a validation period; any other value leaves val_date unset.
mode = 'val'

# Cut-off date for the validation split; only defined in 'val' mode.
val_date = '2014-11-15' if mode == 'val' else None

# Columns to normalise, grouped by kind:
#   'td' - time-dependent features, 'ti' - time-independent features
#   (matching the wording of the scale_data log messages).
cols_to_scale = dict(
    td=['Sales', 'Customers'],
    ti=['Assortment', 'CompetitionDistance'],
)

In [48]:
# Normalise the feature frame and persist both the scaled data and the scale map.
# Positional arguments: frame, scale-map output path, scaled-data output path,
# columns to scale. `mode` / `val_date` are forwarded to the helper.
#
# The scale-map file name is keyed by `mode`, like the scaled-data file.
# Previously it interpolated the whole `cols_to_scale` dict into the path,
# producing a file name full of braces, quotes and brackets.
scaled_data, scale_map = preprocessing_utils.scale_data(
    data,
    DATA_PATH + f'scalemap_{mode}',
    DATA_PATH + f'scaled_data_{mode}',
    cols_to_scale,
    mode=mode,
    val_date=val_date,
)

2021-02-23 16:59:15,910 | INFO : Log Transform Sales and Customers
2021-02-23 16:59:15,962 | INFO : Select records for scaler fitting
2021-02-23 16:59:16,078 | INFO : Start scaling time-dependant features ['Sales', 'Customers']
0it [02:00, ?it/s]
0it [00:04, ?it/s]
100%|██████████| 1115/1115 [03:39<00:00,  5.08it/s]
2021-02-23 17:02:55,594 | INFO : Start scaling time-independant features ['Assortment', 'CompetitionDistance']
2021-02-23 17:02:55,634 | INFO : Convert data formats to reduce memory usage
2021-02-23 17:02:56,030 | INFO : Save pickle files


Mem. usage decreased to 58.20 Mb (74.1% reduction)


In [53]:
# One scale-map entry per scaled column.
scale_map.keys()

dict_keys(['Sales', 'Customers', 'Assortment', 'CompetitionDistance'])

In [54]:
# Number of entries stored for a time-dependent column. The recorded value
# (1115) equals the number of stores iterated in the scaling progress bar,
# so this is presumably one entry per store. TODO confirm.
len(scale_map['Sales'])

1115

In [55]:
# Number of entries stored for a time-independent column (2 in the recorded
# run, presumably a single global pair of scaling statistics). TODO confirm.
len(scale_map['Assortment'])

2

### Build data sequences

In [57]:
# Columns of the scaled frame. Compared with `data`, scaling appended a
# '<column>_mean' column for each scaled feature.
scaled_data.columns

Index(['Date', 'Store', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Month_sin',
       'Month_cos', 'Day_sin', 'Day_cos', 'StoreType', 'Assortment',
       'CompetitionDistance', 'Promo2', 'PromoInterval',
       'CompetitionOpenSinceMonth_sin', 'CompetitionOpenSinceMonth_cos',
       'Promo2SinceWeek_sin', 'Promo2SinceWeek_cos', 'Sales_mean',
       'Customers_mean', 'Assortment_mean', 'CompetitionDistance_mean'],
      dtype='object')

In [7]:
# Settings for the sequence-building step.
# n_steps_in is passed to build_sequence.py as its third argument; the two
# paths name the scaled input and the sequence output for the current mode.
n_steps_in = 90
input_data_filename = f'{DATA_PATH}scaled_data_{mode}'
output_data_filename = f'{DATA_PATH}sequence_data_{mode}'

In [8]:
# Show the resolved input path for the sequence builder.
input_data_filename

'data/scaled_data_val'

In [9]:
# Show the resolved output path for the sequence builder.
output_data_filename

'data/sequence_data_val'

In [10]:
! python build_sequence.py 'data/scaled_data_val' 'data/sequence_data_val' 90 

0it [00:00, ?it/s]2021-02-23 21:43:20,203 | INFO : data/scaled_data_val
2021-02-23 21:43:20,204 | INFO : Time-dependant features: ['Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos']
2021-02-23 21:43:20,204 | INFO : Time-independent features: ['Date', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2', 'PromoInterval', 'CompetitionOpenSinceMonth_sin', 'CompetitionOpenSinceMonth_cos', 'Promo2SinceWeek_sin', 'Promo2SinceWeek_cos', 'Sales_mean', 'Customers_mean', 'Assortment_mean', 'CompetitionDistance_mean']
2021-02-23 21:43:20,204 | INFO : Target Feature: Sales
2021-02-23 21:43:20,204 | INFO : Load pickle file: data/scaled_data_val
2021-02-23 21:43:20,312 | INFO : Start building sequences
0it [00:00, ?it/s]
0it [00:00, ?it/s]                                        | 0/6 [00:00<?, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s

In [12]:
# Load the sequence data written by build_sequence.py.
# NOTE(review): this reads a pickle; only load files produced by this pipeline.
seq_data = general_utils.open_pickle_file(output_data_filename)

### Prepare pytorch dataloader

In [43]:
from utils.preprocessing_utils import StoreDataset

In [69]:
from torch.utils.data import Dataset, DataLoader

0it [00:00, ?it/s]

In [44]:
# Reminder of the validation cut-off date used for the split below.
val_date

'2014-11-15'

In [50]:
# Check that 2013-04-01 lies exactly 90 days (the n_steps_in value) after the
# first date in the data. This is presumably why the training split below
# starts at 2013-04-01. TODO confirm.
pd.to_datetime('2013-04-01') - pd.to_datetime('2013-01-01')

Timedelta('90 days 00:00:00')

In [51]:
# Test sequences: the rows dated 2015-08-01, the first day of the test period.
is_test_start = seq_data['Date'] == '2015-08-01'
test_sequence_data = seq_data[is_test_start]

In [55]:
# Split the sequences by date.
#   'val' mode : train = [2013-04-01, val_date], valid = (val_date, 2015-01-01]
#   otherwise  : train = [2013-04-01, 2015-07-31], valid = empty frame
seq_dates = seq_data['Date']
after_warmup = seq_dates >= '2013-04-01'
if mode != 'val':
    train_sequence_data = seq_data[after_warmup & (seq_dates <= '2015-07-31')]
    valid_sequence_data = pd.DataFrame()
else:
    train_sequence_data = seq_data[after_warmup & (seq_dates <= val_date)]
    valid_sequence_data = seq_data[(seq_dates > val_date) & (seq_dates <= '2015-01-01')]

In [56]:
# Report the (rows, columns) shape of the train, validation and test splits on one line.
split_frames = (train_sequence_data, valid_sequence_data, test_sequence_data)
print(*(frame.shape for frame in split_frames))

(660729, 17) (52405, 17) (856, 17)


In [87]:
# Categorical store-level columns handed to StoreDataset.
cat_columns = [
    'StoreType',
    'Promo2',
    'PromoInterval',
]

# Numeric store-level columns handed to StoreDataset.
num_columns = [
    'CompetitionDistance',
    'CompetitionOpenSinceMonth_sin',
    'CompetitionOpenSinceMonth_cos',
    'Promo2SinceWeek_sin',
    'Promo2SinceWeek_cos',
    'Sales_mean',
    'Customers_mean',
    'Assortment_mean',
    'CompetitionDistance_mean',
]

In [241]:
# Create three identically configured StoreDataset objects (train / valid / test).
# The shared settings live in one dict so the three cannot drift apart.
dataset_kwargs = dict(
    cat_columns=cat_columns,
    num_columns=num_columns,
    embed_vector_size=50,
    ohe_cat_columns=True,
    cat_columns_to_decoder=True,
    decoder_input=False,
)
train_dataset, valid_dataset, test_dataset = (StoreDataset(**dataset_kwargs) for _ in range(3))

2021-02-23 21:40:16,215 | INFO : Create Dataset object
2021-02-23 21:40:16,216 | INFO : Create Dataset object
2021-02-23 21:40:16,217 | INFO : Create Dataset object


In [242]:
# Hand each split's sequence frame to its dataset object, in train / valid / test order.
dataset_inputs = (
    (train_dataset, train_sequence_data),
    (valid_dataset, valid_sequence_data),
    (test_dataset, test_sequence_data),
)
for dataset, sequence_frame in dataset_inputs:
    dataset.load_sequence_data(sequence_frame)

2021-02-23 21:40:16,778 | INFO : Load data
2021-02-23 21:40:16,780 | INFO : Load data
2021-02-23 21:40:16,781 | INFO : Load data


In [243]:
# Prepare the categorical columns of the training dataset (sets `cat_embed_shape`, inspected below).
train_dataset.process_cat_columns()

In [244]:
# Same categorical preprocessing for the validation dataset.
valid_dataset.process_cat_columns()

In [245]:
# Same categorical preprocessing for the test dataset.
test_dataset.process_cat_columns()

In [246]:
# One tuple per categorical column, in cat_columns order. Presumably
# (number of categories, embedding size). TODO confirm against StoreDataset.
train_dataset.cat_embed_shape

[(4, 2), (2, 1), (4, 2)]

In [247]:
# Number of categorical columns.
len(cat_columns)

3

In [248]:
# Number of numeric columns.
len(num_columns)

9

In [249]:
# Scratch computation of a total feature width.
# NOTE(review): 12 presumably is the number of time-dependent features listed
# in the build_sequence.py log, and 2, 1, 2 match the second entries of
# train_dataset.cat_embed_shape shown above. TODO confirm, and derive these
# values instead of hard-coding them.
sum([12, len(num_columns), 2, 1, 2])

26

In [250]:
# Wrap each dataset in a DataLoader. All three use the same batch size, keep
# the dataset order (no shuffling) and keep the final partial batch.
batch_size = 256

loader_options = dict(batch_size=batch_size, shuffle=False, drop_last=False)
train_dataloader = DataLoader(train_dataset, **loader_options)
valid_dataloader = DataLoader(valid_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

# Number of batches per epoch for train and validation.
print(len(train_dataloader), len(valid_dataloader))

2581 205


In [253]:
# Inspect the second element of the first training batch.
# NOTE(review): the recorded output contains NaN in the second column for most
# rows. Confirm this is intended (e.g. a placeholder) before feeding it to a
# model, since NaN inputs propagate through the loss.
next(iter(train_dataloader))[1]

tensor([[[ 0.8193,     nan],
         [-0.0476,     nan],
         [-0.4348,     nan],
         ...,
         [-0.2498,     nan],
         [ 0.2030,     nan],
         [-0.2908,     nan]],

        [[-0.0476,     nan],
         [-0.4348,     nan],
         [-0.3181,     nan],
         ...,
         [ 0.2030,     nan],
         [-0.2908,     nan],
         [-0.2908,     nan]],

        [[-0.4348,     nan],
         [-0.3181,     nan],
         [ 0.4282,     nan],
         ...,
         [-0.2908,     nan],
         [-0.2908,     nan],
         [ 0.1318,     nan]],

        ...,

        [[-0.4250,     nan],
         [-0.7100,     nan],
         [ 0.0386,     nan],
         ...,
         [ 0.4512,  0.7275],
         [-0.1367,  0.8232],
         [-0.1367,  0.9199]],

        [[-0.7100,     nan],
         [ 0.0386,     nan],
         [ 1.3643,     nan],
         ...,
         [-0.1367,  0.8232],
         [-0.1367,  0.9199],
         [-1.3682,  0.5083]],

        [[ 0.0386,     nan],
       

In [252]:
# Unpack the first training batch into its inputs and target, then show the tensor shapes.
# NOTE(review): in the recorded run this cell raised
# "ValueError: too many values to unpack (expected 2)", so the batch does not
# have the ((X_con, X_dec), y) layout assumed here. Align the unpacking with
# what StoreDataset.__getitem__ returns for the current settings (or remove
# the cell) so the notebook survives Restart & Run All.
(X_con, X_dec), y = next(iter(train_dataloader))
X_con.shape, y.shape, X_dec.shape

ValueError: too many values to unpack (expected 2)

In [190]:
# Peek at the first tensor nested inside the first training sample
# (sample 0 -> element 0 -> element 0).
train_dataset[0][0][0]

tensor([[ 0.8613,  1.1084,  1.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3936,  0.1747,  1.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2056,  0.6206,  1.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 2.1094,  2.0195,  1.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 2.1094,  2.0195,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.8193,  0.7241,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])