In [1]:
import numpy as np
import pandas as pd
# !pip install pytorch_lightning
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import Trainer

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
import numpy as np

from pathlib import Path

# Folder (under the user's home directory) that holds the competition CSV files.
data_dir = Path.home().joinpath('data/kaggle/m5-forecasting-accuracy')

In [2]:
# Shell escape: list the files currently present in `data_dir`.
!ls $data_dir

calendar.csv		     sales_train_validation.csv  sell_prices.csv
m5-forecasting-accuracy.zip  sample_submission.csv


#### Sales

In [3]:
%%time
# Read the wide sales table (one row per series, one `d_<n>` column per day).
sales = pd.read_csv(data_dir / 'sales_train_validation.csv')
print(f'sales.shape: {sales.shape}')

# Identifier columns that are replaced by ordinal integer codes.
cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

# Fit one encoder per column on the raw values and keep them all in
# `encoders`, keyed by column name, so other tables can apply the same mapping.
encoders = {col: OrdinalEncoder().fit(sales[[col]]) for col in cat_cols}

# Overwrite each raw column with its codes, stored compactly as int16.
for col, encoder in encoders.items():
    sales[col] = encoder.transform(sales[[col]]).ravel().astype(np.int16)

sales.shape: (30490, 1919)
CPU times: user 6.31 s, sys: 1.04 s, total: 7.35 s
Wall time: 7.35 s


In [4]:
# Inspect the wide layout: `id`, the five encoded identifier columns, then the
# per-day `d_<n>` columns.
sales.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1904', 'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910',
       'd_1911', 'd_1912', 'd_1913'],
      dtype='object', length=1919)

#### Calendar

In [5]:
%%time
# Load the calendar table. `d` is renamed to `day` so it matches the column
# name produced when the sales frame is melted and can be used as a merge key.
calendar = pd.read_csv(data_dir/'calendar.csv')\
            .rename(columns={'d':'day'})

# Calendar columns that are imputed and then ordinal-encoded.
cat_cal_cols = ['wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI']
# ignore_cal_cols = ['wm_yr_wk']

for col in cat_cal_cols:

    # impute: choose the missing-value sentinel from the column's dtype.
    # Every column must resolve to a sentinel explicitly; without the final
    # `else`, an unexpected dtype would silently reuse the sentinel chosen for
    # the previous column (or raise NameError if it were the first column).
    col_dtype = calendar[col].dtype
    if pd.api.types.is_integer_dtype(col_dtype):
        fill_value = -1
    elif pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        fill_value = 'abcxyz'
    else:
        raise TypeError(
            f'No imputation fill value defined for column {col!r} with dtype {col_dtype}'
        )
    calendar[[col]] = SimpleImputer(strategy='constant', fill_value=fill_value).fit_transform(calendar[[col]])

    # encode: reuse an encoder already registered under this column name,
    # otherwise fit a new one and register it in `encoders` so that other
    # tables sharing the column (e.g. `wm_yr_wk` in the prices table) are
    # transformed with the same mapping.
    if col not in encoders:
        encoders[col] = OrdinalEncoder().fit(calendar[[col]])
    # NOTE: the codes are written back exactly as returned by the encoder,
    # without an integer cast.
    calendar[[col]] = encoders[col].transform(calendar[[col]])

CPU times: user 123 ms, sys: 10.6 ms, total: 133 ms
Wall time: 132 ms


In [6]:
# Inspect the calendar columns after encoding; note `d` now appears as `day`.
calendar.columns

Index(['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'day',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'],
      dtype='object')

#### Prices

In [7]:
%%time
# Read the price table and convert its key columns to the ordinal codes of the
# encoders already stored in `encoders`, so the keys line up with the encoded
# sales and calendar frames.
prices = pd.read_csv(data_dir.joinpath('sell_prices.csv'))
for col in ('store_id', 'item_id', 'wm_yr_wk'):
    key_encoder = encoders[col]
    prices[[col]] = key_encoder.transform(prices[[col]])

CPU times: user 9.73 s, sys: 2.19 s, total: 11.9 s
Wall time: 11.9 s


In [9]:
%%time
# Reshape wide -> long: every non-identifier column becomes one row per
# series, with the column name in `day` and its value in `demand`.
sales2 = sales.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='day',
    value_name='demand',
)

CPU times: user 5.05 s, sys: 2.86 s, total: 7.91 s
Wall time: 7.91 s


In [10]:
# Sanity check on the long frame: 30490 series x 1913 day columns gives
# 58,327,370 rows; the 8 columns are the 6 id columns plus `day` and `demand`.
sales2.shape

(58327370, 8)

In [11]:
%%time
# Attach calendar attributes to every (series, day) row; a left join keeps
# all sales rows.
sales3 = pd.merge(sales2, calendar, how='left', on='day')

# Attach the price for each row's store / item / week key; rows with no
# matching price entry keep a null `sell_price`.
sales4 = pd.merge(
    sales3,
    prices,
    how='left',
    on=['store_id', 'item_id', 'wm_yr_wk'],
)

CPU times: user 48 s, sys: 32.4 s, total: 1min 20s
Wall time: 1min 20s


In [12]:
# Sanity check after the joins: the row count should equal that of `sales2`
# (left joins must not add or drop rows); only the column count grows.
sales4.shape

(58327370, 22)

In [14]:
%%time
# Count missing values per column of the combined frame.
sales4.isna().sum()

CPU times: user 17.5 s, sys: 12.9 s, total: 30.4 s
Wall time: 30.4 s


id                     0
item_id                0
dept_id                0
cat_id                 0
store_id               0
state_id               0
day                    0
demand                 0
date                   0
wm_yr_wk               0
weekday                0
wday                   0
month                  0
year                   0
event_name_1           0
event_type_1           0
event_name_2           0
event_type_2           0
snap_CA                0
snap_TX                0
snap_WI                0
sell_price      12299413
dtype: int64

In [15]:
# Inspect column dtypes. The sales identifier columns were cast to int16 after
# encoding, whereas the calendar-derived encoded columns were written back
# without an integer cast.
sales4.dtypes

id               object
item_id           int16
dept_id           int16
cat_id            int16
store_id          int16
state_id          int16
day              object
demand            int64
date             object
wm_yr_wk        float64
weekday         float64
wday            float64
month           float64
year            float64
event_name_1    float64
event_type_1    float64
event_name_2    float64
event_type_2    float64
snap_CA         float64
snap_TX         float64
snap_WI         float64
sell_price      float64
dtype: object

In [16]:
%%time
# Persist the combined frame as Parquet. The path is relative, so the file is
# written to the kernel's current working directory (not `data_dir`).
sales4.to_parquet('combined.pq')

CPU times: user 47.7 s, sys: 9.26 s, total: 56.9 s
Wall time: 49.3 s
