In [2]:
# default_exp m5.dataprep

# Data preparation & feature engineering

> Description of all calculated features.

Several kinds of features are computed here:

   - Time series features
   - Lags of the target variable (sales) and some variations of it (rolling means, aggregations)
   - Pricing features
   - Cumulative means of sales on different time frames and at various levels of aggregation
   - Encoding for all categorical variables
   
More features than needed will be calculated, in order to be able to select the best afterwards. 

Before going into more details about each kind of predictor, the following chart shows the **best-performing features**, classified by category, based on the selection method that will be described later.

In [3]:
#hide
from IPython.display import Image

NB: This chart only shows the best-performing features, therefore even the "one star" ones are among the most relevant ones.

## Utils

In [4]:
#hide
import os
import inspect
import pandas as pd
import numpy as np
import json
from io import BytesIO
from google.cloud import storage
import time
import datetime
from datetime import timedelta
import warnings
import logging
from functools import reduce

from functools import wraps
import datetime as dt

from catboost import CatBoostRegressor, Pool, cv

from dateutil.relativedelta import relativedelta
from copy import deepcopy

logging.basicConfig(level=logging.INFO)
warnings.filterwarnings('ignore')

The following versions of libraries have been used :

In [5]:
print('Pandas :',pd.__version__)
print('Numpy :',np.__version__)

Pandas : 1.3.4
Numpy : 1.20.3


In [6]:
#hide
class GCSConnector:
    """
    Object: GCSConnector(Object)
    Purpose: Connector to a single bucket of a Google Cloud Storage account.

    The storage client is kept on the instance (``_gcsclient``) because
    ``file_exists`` needs to pass it to ``Blob.exists``.
    """

    def __init__(self, bucketname, project_id):
        """
        Initialize Google Cloud Storage Connector to bucket
        :param bucketname: (str) bucket name
        :param project_id: (str) project id
        """
        # Store the client: file_exists() reads self._gcsclient.
        self._gcsclient = storage.Client(project=project_id)
        self._bucket = self._gcsclient.get_bucket(bucketname)

    def get_file(self, filename):
        """
        Get file content from GCS
        :param filename: (str) name of the blob inside the bucket
        :return: (BytesIO) GCS File as byte
        """
        blob = storage.Blob(filename, self._bucket)
        content = blob.download_as_string()
        return BytesIO(content)

    def send_json(self, json_file, filename):
        """
        Serialize an object with ``json.dumps`` (non-ASCII characters kept as-is)
        and upload the resulting string to the bucket.
        :param json_file: JSON-serializable object to upload
        :param filename: (str) destination blob name
        :return: None
        """
        self._bucket.blob(filename).upload_from_string(json.dumps(json_file, ensure_ascii=False))

    def send_dataframe(self, df, filename, **kwargs):
        """
        Upload a dataframe as CSV text (without the index).
        :param df: dataframe to upload
        :param filename: (str) destination blob name
        :param kwargs: extra keyword arguments forwarded to ``DataFrame.to_csv``
        :return: None
        """
        self._bucket.blob(filename).upload_from_string(
            df.to_csv(index=False, **kwargs), content_type="application/octet-stream")

    def open_csv_as_dataframe(self, filename, **kwargs):
        """
        Download a blob and parse it with ``pandas.read_csv``.
        :param filename: (str) name of the blob inside the bucket
        :param kwargs: extra keyword arguments forwarded to ``pandas.read_csv``
        :return: (DataFrame)
        """
        return pd.read_csv(self.get_file(filename=filename), **kwargs)

    def open_csv_as_dataframe_dtype(self, filename, dtypes_dict, **kwargs):
        """
        Same as ``open_csv_as_dataframe`` but with explicit column dtypes.
        :param filename: (str) name of the blob inside the bucket
        :param dtypes_dict: value passed as ``dtype`` to ``pandas.read_csv``
        :param kwargs: extra keyword arguments forwarded to ``pandas.read_csv``
        :return: (DataFrame)
        """
        return pd.read_csv(self.get_file(filename=filename), dtype=dtypes_dict, **kwargs)

    def open_json_as_dataframe(self, filename, **kwargs):
        """
        Download a blob and parse it with ``pandas.read_json``.
        :param filename: (str) name of the blob inside the bucket
        :param kwargs: extra keyword arguments forwarded to ``pandas.read_json``
        :return: (DataFrame)
        """
        return pd.read_json(self.get_file(filename=filename), **kwargs)

    def open_excel_as_dataframe(self, filename, **kwargs):
        """
        Download a blob and parse it with ``pandas.read_excel``.
        :param filename: (str) name of the blob inside the bucket
        :param kwargs: extra keyword arguments forwarded to ``pandas.read_excel``
        :return: (DataFrame)
        """
        return pd.read_excel(self.get_file(filename=filename), **kwargs)

    def file_exists(self, filename):
        """
        Check if 'filename' file exists within bucket
        :param filename: (str) name of the blob inside the bucket
        :return: (Bool)
        """
        return storage.Blob(filename, self._bucket).exists(self._gcsclient)

    def list_files(self, prefix, delimiter=None):
        """
        List the names of the blobs returned by ``bucket.list_blobs``.
        :param prefix: (str) prefix forwarded to ``list_blobs``
        :param delimiter: (str or None) delimiter forwarded to ``list_blobs``
        :return: (list of str) blob names
        """
        return [blob.name for blob in self._bucket.list_blobs(prefix=prefix, delimiter=delimiter)]

In [7]:
#hide
# Working locally or on a VM

# The environment is inferred from the second component of the working
# directory path ('/Users/...' -> local, '/home/...' -> VM).
# NOTE(review): when the path matches neither branch, none of the names below
# (stored_locally, main_path, GCS_CONNECTOR, ...) are defined.
if os.getcwd().split('/')[1] == 'Users':  # Local
    stored_locally = True
    # Data folder is "<parent of the directory holding the current frame's file>/data/".
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    parentdir = os.path.dirname(currentdir)
    main_path = parentdir + "/data/"
    
elif os.getcwd().split('/')[1] == 'home':  # VM
    stored_locally = False
    # On the VM, files are accessed through a connector to the GCS bucket.
    bucketname = "m5-forecast"
    project_id = "data-sandbox-fr"
    GCS_CONNECTOR = GCSConnector(bucketname, project_id)

In [8]:
#IMPORTANT: where to find the data? 
# inside the project data-sandbox-fr on GCP, inside Google Cloud Storage
# in the bucket m5-forecast/camille-uncertainty-with-catboost
# you can use the folder raw_data

# Input files path
calendar_path = "raw_data/calendar.csv"
sales_path = "raw_data/sales_train_evaluation.csv"
prices_path = "raw_data/sell_prices.csv"

In [9]:
#hide
# Cross-validation scheme

# All windows below are inclusive [START, END] ranges of integer day indices;
# every prediction/validation window spans 28 consecutive days (END - 27 .. END).
# presumably these indices are compared with `date_block_num_day` - TODO confirm.

# Prediction window: the last 28 day indices.
END_PRED = 1968 
START_PRED = END_PRED - 27
DAYS_PRED = list(range(START_PRED, END_PRED + 1))

# Validation folds 1-3: three consecutive 28-day windows right before the prediction window.
END_VALID1 = START_PRED - 1
START_VALID1 = END_VALID1 - 27
DAYS_VALID1 = list(range(START_VALID1, END_VALID1 + 1))  # 28 days period before pred

END_VALID2 = START_VALID1 - 1
START_VALID2 = END_VALID2 - 27
DAYS_VALID2 = list(range(START_VALID2, END_VALID2 + 1))  # 28 days period before

END_VALID3 = START_VALID2 - 1
START_VALID3 = END_VALID3 - 27
DAYS_VALID3 = list(range(START_VALID3, END_VALID3 + 1))  # 28 days period before

# Validation folds 4-5: windows ending 366 days, then a further 365 days, before END_PRED.
END_VALID4 = END_PRED - 366
START_VALID4 = END_VALID4 - 27
DAYS_VALID4 = list(range(START_VALID4, END_VALID4 + 1))  # 1 year before pred

END_VALID5 = END_VALID4 - 365
START_VALID5 = END_VALID5 - 27
DAYS_VALID5 = list(range(START_VALID5, END_VALID5 + 1))  # 2 years before pred

# Each training period ends on the day index just before its validation window starts;
# the "all" variant ends just before the prediction window.
END_TRAIN1 = START_VALID1 - 1
END_TRAIN2 = START_VALID2 - 1
END_TRAIN3 = START_VALID3 - 1
END_TRAIN4 = START_VALID4 - 1
END_TRAIN5 = START_VALID5 - 1
END_TRAIN_all = START_PRED - 1

# Every training period starts at day index 0.
START_TRAIN = 0

DAYS_TRAIN1 = list(range(START_TRAIN, END_TRAIN1 + 1))
DAYS_TRAIN2 = list(range(START_TRAIN, END_TRAIN2 + 1))
DAYS_TRAIN3 = list(range(START_TRAIN, END_TRAIN3 + 1))
DAYS_TRAIN4 = list(range(START_TRAIN, END_TRAIN4 + 1))
DAYS_TRAIN5 = list(range(START_TRAIN, END_TRAIN5 + 1))
DAYS_TRAIN_all = list(range(START_TRAIN, END_TRAIN_all + 1))

In [10]:
#export

def log_step(func):
    """Decorator logging, at INFO level, the duration and start time of each call.

    The wrapped function's return value is passed through unchanged. Nothing is
    logged when the wrapped function raises.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        started_at = datetime.datetime.now()
        outcome = func(*args, **kwargs)
        elapsed = datetime.datetime.now() - started_at
        logging.info(f" -- Step {func.__name__} took {elapsed}s - {started_at} --")
        return outcome

    return timed_call

A few functions will be needed to do all the feature engineering :

The function `reduce_mem_usage` optimizes columns types to reduce memory usage when computing the features.

In [11]:
#export
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest dtype that fits.

    Only columns whose dtype is int16/32/64 or float16/32/64 are considered.
    A candidate dtype is accepted when the column min and max lie strictly
    inside the candidate's range. Float columns that fit neither float16 nor
    float32 are cast to float64; integer columns that fit no candidate are left
    untouched.

    Args:
        df: dataframe to shrink (modified in place).
        verbose: when True, print the final memory usage and the reduction.
    Returns:
        The same dataframe object.
    """
    bytes_per_mb = 1024 ** 2
    downcastable = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    mem_before = df.memory_usage().sum() / bytes_per_mb
    for name in df.columns:
        dtype_name = str(df[name].dtypes)
        if dtype_name not in downcastable:
            continue
        lowest = df[name].min()
        highest = df[name].max()
        if dtype_name.startswith('int'):
            # Smallest integer type first; stop at the first one that fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
            else:
                # Neither float16 nor float32 fits (this includes all-NaN columns).
                df[name] = df[name].astype(np.float64)
    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after,
                                                                              100 * (mem_before - mem_after) / mem_before))
    return df

The function `merge_by_concat` merges two dataframes without losing dtype information.

In [12]:
#export
def merge_by_concat(df1, df2, merge_on):
    """Left-join the extra columns of ``df2`` onto ``df1`` without touching ``df1``'s dtypes.

    Only the key columns of ``df1`` go through ``merge``; the looked-up columns
    are then concatenated side by side with the untouched ``df1``.

    Args:
        df1: left dataframe (returned with its own columns unchanged).
        df2: dataframe providing the extra columns; it must hold at most one row
            per key, otherwise the row counts no longer match.
        merge_on: list of key column names present in both frames.
    Returns:
        A new dataframe: ``df1`` plus the non-key columns of ``df2``.
    Raises:
        ValueError: if the merge does not return exactly one row per row of ``df1``.
    """
    looked_up = df1[merge_on].merge(df2, on=merge_on, how='left')
    new_columns = [col for col in looked_up.columns if col not in merge_on]
    # merge() returns a fresh 0..n-1 index while concat(axis=1) aligns on index
    # labels: re-label the looked-up rows with df1's index so that rows are
    # paired by position even when df1 does not carry a default index.
    added = looked_up[new_columns].set_axis(df1.index, axis=0)
    return pd.concat([df1, added], axis=1)

When computing a group of new features, only some features computed previously need to be kept in memory (as they will be needed to calculate the new ones). All features not needed can be dropped. That's what the function `get_input_df` is used for.

In [13]:
#export
def get_input_df(input_cols, df=None):
    """Return an independent copy of the index columns plus ``input_cols``.

    Args:
        input_cols: list of column names needed to compute the next feature group.
        df: source dataframe; defaults to the notebook-level ``features_time``.
    Returns:
        A new dataframe with columns ``['id', 'date_block_num_day'] + input_cols``.
    """
    index_cols = ['id','date_block_num_day']
    source = features_time if df is None else df
    # Select first, copy second: only the requested columns are duplicated
    # instead of the whole source frame.
    return source[index_cols + input_cols].copy()

Once a group of features is calculated and stored in a dataframe, the function `feature_formatting` is used for :
 - Keeping only the columns needed
 - Optimizing memory usage 
 - Printing features information (dtypes and NaN)

In [14]:
#export
def feature_formatting(df, input_cols):
    """Keep only the index and newly computed columns, shrink dtypes and print a summary.

    Args:
        df: dataframe holding the index columns, the input columns and the new features.
        input_cols: list of input column names to drop from the result.
    Returns:
        Dataframe with ``['id', 'date_block_num_day']`` first, followed by the
        remaining (feature) columns, after ``reduce_mem_usage``.
    """
    index_cols = ['id','date_block_num_day']
    df = df.drop(input_cols, axis=1)
    cols_to_keep = index_cols + [i for i in df.columns if i not in index_cols]
    df = df[cols_to_keep]
    df = reduce_mem_usage(df)
    # info() prints by itself and returns None: wrapping it in print() added a
    # stray "None" to the output. `show_counts` replaces the deprecated
    # `null_counts` keyword (available from pandas 1.2).
    df.drop(index_cols,axis=1).info(show_counts=True,memory_usage=False)
    return df

## Data loading

The dataset is composed of 3 files: sales, prices and calendar data. All input files can be found on [GCS](https://console.cloud.google.com/storage/browser/m5-forecast;tab=objects?forceOnBucketsSortingFiltering=false&project=data-sandbox-fr&prefix=). To execute this notebook and the following ones, put all the files in a folder named `raw_data` located in the same directory as the notebooks (the input paths defined above are relative paths of the form `raw_data/<file>.csv`).

In [15]:
# The three raw inputs are read from the relative paths configured above.
sales_df = pd.read_csv(sales_path)
calendar_df = pd.read_csv(calendar_path)
prices_df = pd.read_csv(prices_path)

**Sales data** contains historical sales count for the last 5 years, for each product in every location.

In [16]:
sales_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


**Calendar data** is composed of :
 - Date information
 - Calendar events
 - Snap days (days when people get benefits to buy food) for each state

In [17]:
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


**Prices data** gives the historical price of each item in each store (prices can change every week).

In [18]:
prices_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In order to run everything quickly and avoid memory issues, we will work here on a subset (300 items out of the 3049 in total).

In [19]:
# Fixed seed: the sampled subset (and every output below) is identical across runs,
# and re-running this cell on the already-reduced frame selects the same items again.
SUBSET_SEED = 42
all_items = sales_df['item_id'].unique()
subset_size = 300
subset_items = np.random.default_rng(SUBSET_SEED).choice(all_items, subset_size, replace=False)
sales_df = sales_df.loc[sales_df['item_id'].isin(subset_items)]

## Pre-processing

Some pre-processing is applied before calculating features :

 - Sales data is melted, to get one row per item x store and per day.
 - Rows for days to forecast are concatenated at the bottom of sales data, to compute features for these days as well later.
 - Calendar data, prices data and sales data are merged.
 - Rows dated before a product's release in a store are not "real" zero sales, so we drop them. For each item x store, the "release" week is the first week (`wm_yr_wk`) for which a price is listed in the prices data; only rows whose week is on or after that release week are kept.

In [20]:
#export
@log_step
def format_calendar_data(calendar):
    """Select and type the calendar columns used downstream.

    Keeps ``date``, ``d``, ``wm_yr_wk`` plus the event and SNAP columns, parses
    ``date`` (format ``%Y-%m-%d``) and converts the event/SNAP columns to
    ``category``.

    Args:
        calendar: raw calendar dataframe (left untouched).
    Returns:
        A new, formatted dataframe.
    """
    cols_calendar1 = ['date','d','wm_yr_wk']
    cols_calendar2 = ['event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']
    # Work on an explicit copy: the caller's frame is no longer modified and no
    # assignment is made into a slice of it.
    formatted = calendar[cols_calendar1 + cols_calendar2].copy()
    formatted["date"] = pd.to_datetime(formatted["date"], format='%Y-%m-%d')
    return formatted.astype({col: 'category' for col in cols_calendar2})

In [21]:
#export
@log_step
def melt_sales_data(sales):
    """Reshape wide sales data to long format.

    The six identifier columns are kept as-is; every other column becomes one
    row, with its column name stored in ``date`` and its value in ``sales``.
    """
    return pd.melt(
        sales,
        id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'],
        var_name="date",
        value_name="sales",
    )

In [22]:
#export
@log_step
def add_forecast_days(sales, n_items=3049, first_forecast_day=1942, horizon=28):
    """Append empty rows (``sales`` = NaN) for the days to forecast.

    The rows of days ``d_1 .. d_<horizon>`` are used as a template: each is
    duplicated with day ``d_k`` relabelled ``d_<k + first_forecast_day - 1>``
    (``d_1942 .. d_1969`` with the defaults).

    Args:
        sales: melted sales dataframe whose ``date`` column holds ``'d_<n>'`` labels.
        n_items: kept for backward compatibility and ignored. The relabelling no
            longer assumes a fixed number of rows per day.
        first_forecast_day: day number of the first forecast day.
        horizon: number of forecast days.
    Returns:
        A new dataframe: the input rows followed by the forecast rows, with a fresh index.
    """
    template_days = ['d_' + str(i) for i in range(1, horizon + 1)]
    subset = sales.loc[sales['date'].isin(template_days)].copy()
    # Relabel each template row from its own day number, so the result does not
    # depend on the row order or on every day having the same number of rows.
    day_numbers = subset['date'].str[2:].astype(int)
    subset['date'] = 'd_' + (day_numbers + first_forecast_day - 1).astype(str)
    subset['sales'] = np.nan
    return pd.concat([sales, subset], axis=0).reset_index(drop=True)

In [23]:
#export
@log_step
def reduce_memory(sales):
    """Convert the six identifier columns to ``category`` and downcast the rest.

    The identifier columns are converted on the input frame itself, which is
    then passed to ``reduce_mem_usage``.
    """
    for id_col in ('id','item_id','dept_id','cat_id','store_id','state_id'):
        sales[id_col] = sales[id_col].astype('category')
    return reduce_mem_usage(sales)

In [24]:
#export
@log_step
def create_release_date_column(sales,prices):
    """Add a ``release`` column: the smallest ``wm_yr_wk`` found in ``prices`` per store x item."""
    first_price_week = (
        prices.groupby(['store_id','item_id'])['wm_yr_wk']
              .min()
              .reset_index()
              .rename(columns={'wm_yr_wk': 'release'})
    )
    return merge_by_concat(sales, first_price_week, ['store_id','item_id'])

In [25]:
#export
@log_step
def merge_sales_calendar(sales,calendar):
    sales = sales.rename(columns={"date":"d"})
    sales = merge_by_concat(sales, calendar, ["d"])
    return sales

In [26]:
#export
@log_step
def filter_out_sales_before_release_date(sales):
    """Drop rows dated before the item's release week and normalize ``release``.

    Rows are kept when ``wm_yr_wk >= release`` (earlier rows are not "real zeros").
    ``release`` is then shifted so that its minimum over the kept rows is 0 and
    stored as int16.
    """
    released = sales.loc[sales['wm_yr_wk'] >= sales['release']].reset_index(drop=True)
    earliest_release = released['release'].min()
    released['release'] = (released['release'] - earliest_release).astype(np.int16)
    return released

In [27]:
#export
@log_step
def merge_sales_prices(sales,prices):
    sales = merge_by_concat(sales, prices, ["store_id","item_id","wm_yr_wk"])
    sales = sales.drop(["d"], axis=1)
    return sales

In [28]:
# Pre-processing pipeline, in order: melt to long format, append the forecast rows,
# shrink dtypes, attach the release week, attach the calendar columns, drop
# pre-release rows, attach the prices.
# NOTE: this rebinds `sales_df`, so re-running the cell on its own output is not supported.
sales_df = sales_df.pipe(melt_sales_data) \
                   .pipe(add_forecast_days, n_items = subset_size) \
                   .pipe(reduce_memory) \
                   .pipe(create_release_date_column, prices = prices_df) \
                   .pipe(merge_sales_calendar, calendar = format_calendar_data(calendar_df)) \
                   .pipe(filter_out_sales_before_release_date) \
                   .pipe(merge_sales_prices, prices = prices_df)

INFO:root: -- Step melt_sales_data took 0:00:02.651604s - 2022-05-24 15:08:28.280837 --
INFO:root: -- Step add_forecast_days took 0:00:02.602924s - 2022-05-24 15:08:30.936186 --
INFO:root: -- Step reduce_memory took 0:00:04.602021s - 2022-05-24 15:08:33.686497 --


Mem. usage decreased to 101.50 Mb (25.0% reduction)


INFO:root: -- Step create_release_date_column took 0:00:03.604285s - 2022-05-24 15:08:38.289312 --
INFO:root: -- Step format_calendar_data took 0:00:00.037030s - 2022-05-24 15:08:41.931631 --
INFO:root: -- Step merge_sales_calendar took 0:00:01.678634s - 2022-05-24 15:08:41.969969 --
INFO:root: -- Step filter_out_sales_before_release_date took 0:00:00.791945s - 2022-05-24 15:08:43.696295 --
INFO:root: -- Step merge_sales_prices took 0:00:05.371927s - 2022-05-24 15:08:44.559812 --


Now the dataset is ready for feature calculations.

In [29]:
print(sales_df.shape)
sales_df.head()

(4652320, 18)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,sales,release,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_015_CA_1_evaluation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,4.0,0,2011-01-29,11101,,,,,0,0,0,0.7
1,HOBBIES_1_029_CA_1_evaluation,HOBBIES_1_029,HOBBIES_1,HOBBIES,CA_1,CA,2.0,0,2011-01-29,11101,,,,,0,0,0,7.44
2,HOBBIES_1_036_CA_1_evaluation,HOBBIES_1_036,HOBBIES_1,HOBBIES,CA_1,CA,2.0,0,2011-01-29,11101,,,,,0,0,0,0.96
3,HOBBIES_1_044_CA_1_evaluation,HOBBIES_1_044,HOBBIES_1,HOBBIES,CA_1,CA,3.0,0,2011-01-29,11101,,,,,0,0,0,1.51
4,HOBBIES_1_055_CA_1_evaluation,HOBBIES_1_055,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0,2011-01-29,11101,,,,,0,0,0,7.44


In [30]:
#hide
del calendar_df
del prices_df

## Feature engineering

The idea is to compute a large number of features among those mentioned in the Kaggle discussions. These features won't be all kept for predictions, they will be selected later. 

### Time features

We first compute classical time features : day, week, month, year...

In [31]:
# export
@log_step
def time_series_features(df, name_date_col):
    """Compute calendar features from a date column.

    Added columns: ``day``, ``week`` (ISO week number), ``month``, ``year``
    (offset from the earliest year in the data), ``dayofweek``, ``weekend``
    (1 when ``dayofweek >= 5``), ``dayofyear``, ``date_block_num_month`` /
    ``_week`` / ``_day`` (whole months / weeks / days elapsed since the earliest
    date) and ``year_month`` (first day of the row's month).

    Args:
        df: a pandas dataframe (left untouched; a copy is returned).
        name_date_col: the name of the date column (format ``%Y-%m-%d`` when stored as text).
    Returns:
        A new dataframe: the input columns plus the features above.
    Raises:
        ValueError: if the date column cannot be interpreted as datetimes.
    """
    # One "month" = 30.436875 days (365.2425 / 12), the length NumPy gives to
    # np.timedelta64(1, 'M'). Spelled out because recent pandas versions
    # refuse to divide by month-unit timedeltas.
    days_per_month = 30.436875

    df = df.copy()
    df[name_date_col] = pd.to_datetime(df[name_date_col], format='%Y-%m-%d')
    dates = df[name_date_col]

    if not pd.api.types.is_datetime64_any_dtype(dates):
        raise ValueError('{} must be a datetime column '.format(name_date_col))

    df["day"] = dates.dt.day.astype(np.int8)
    # `.dt.week` is deprecated; isocalendar().week returns the same ISO week number.
    df['week'] = dates.dt.isocalendar().week.astype(np.int8)
    df["month"] = dates.dt.month.astype(np.int8)
    df["year"] = dates.dt.year

    df["dayofweek"] = dates.dt.dayofweek.astype(np.int8)
    df['weekend'] = (df["dayofweek"]>=5).astype(np.int8)
    df["dayofyear"] = dates.dt.dayofyear.astype(np.int16)

    # Be careful to have the same min_date in your train, test dataset.
    # Especially in production if you don't train and predict at the same moment.
    first_day = dates.min()
    elapsed_days = (dates - first_day).dt.days

    df["date_block_num_month"] = (elapsed_days / days_per_month).astype(int)
    df["date_block_num_week"] = (elapsed_days // 7).astype(int)
    df["date_block_num_day"] = elapsed_days.astype(int)

    # First day of each row's month, without going through a temporary column.
    df["year_month"] = dates.dt.to_period("M").dt.to_timestamp()

    df["year"] = (df["year"] - df["year"].min()).astype(np.int8)

    return df

In [32]:
# Work on a copy so that `sales_df` keeps its original columns:
# the next cell relies on the difference between the two column sets.
input_df = sales_df.copy()

# `features_time` is the frame from which `get_input_df` later selects its columns.
features_time = input_df.pipe(time_series_features, 'date') \
                        .pipe(reduce_mem_usage)

INFO:root: -- Step time_series_features took 0:00:05.662759s - 2022-05-24 15:08:51.367356 --


Mem. usage decreased to 230.81 Mb (37.3% reduction)


In [33]:
# Display features just calculated, with type and NaN count.
# A list keeps the columns in their creation order (indexing with a set is
# deprecated and unordered); `show_counts` replaces the deprecated `null_counts`.
new_time_features = [col for col in features_time.columns if col not in sales_df.columns]
features_time[new_time_features].info(show_counts=True, memory_usage=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4652320 entries, 0 to 4652319
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   year_month            4652320 non-null  datetime64[ns]
 1   month                 4652320 non-null  int8          
 2   date_block_num_day    4652320 non-null  int16         
 3   weekend               4652320 non-null  int8          
 4   date_block_num_week   4652320 non-null  int16         
 5   week                  4652320 non-null  int8          
 6   dayofweek             4652320 non-null  int8          
 7   dayofyear             4652320 non-null  int16         
 8   date_block_num_month  4652320 non-null  int8          
 9   day                   4652320 non-null  int8          
 10  year                  4652320 non-null  int8          
dtypes: datetime64[ns](1), int16(3), int8(7)

### Cumulative means of sales in similar periods in the past

Then, we compute cumulative averages of sales over similar periods back in time, at different levels of aggregation.

The periods considered are : same week (1-52) / same day of the month (1-31) / same month (1-12), and the aggregation levels are defined in the `list_agg_levels` list below. 

For example, if a given row corresponds to the sales of product **HOBBIES_1_004** in store **CA_1** on a day of **week 33** of **year 2016**, the feature `cum_mean_week_store_id_item_id` will be equal to the average sales of the same product in the same store **over weeks 33 of years 2011 to 2015**. The current year (2016 here) is excluded from the average because it won't be available on the prediction set.

In [34]:
#export
def cum_mean_similar_periods(df, period:str, agg_level:list, target_name:str):
    """Add the cumulative mean of the target over the same period in previous years.

    The new column is named ``'cum_mean_' + '_'.join([period] + agg_level)``.

    Args:
        df: a pandas dataframe containing ``year``, ``period``, the ``agg_level``
            columns and ``target_name``.
        period: name of the period column on which cumulative means are computed:
            - 'day' : same day of the month
            - 'week' : same week of the year
            - 'month' : same month of the year
        agg_level: list of column names defining the aggregation level.
        target_name: name of the target column.
    Returns:
        The input dataframe left-merged with the new cumulative-mean column.
    """
    
    def sum_div1000(x):
        # This function is only used to avoid getting "inf" when sums become too large
        return 0.001 * sum(x)
    
    agg_cols = [period] + agg_level
    # One row per (year, period, agg level) with the count and scaled sum of the target.
    # Rows whose scaled sum is NaN are dropped; presumably these are the groups that
    # contain NaN targets (e.g. the days to forecast) - TODO confirm.
    group = df.groupby(['year'] + agg_cols)[target_name].agg(['count',sum_div1000]).reset_index().dropna(subset=['sum_div1000'])
    # Running totals within each (period, agg level), in the row order of `group`
    # (assumes groupby's default sorting, which puts years in ascending order).
    cumsums = group.groupby(agg_cols).cumsum()[['count','sum_div1000']]
    cumsums.columns = ['cum_count','cum_sum']
    group2 = group.merge(cumsums, how='left', left_index=True, right_index=True)
    # Multiply by 1000 to undo the scaling applied in sum_div1000.
    group2['cum_mean_' + '_'.join(agg_cols)] = 1000 * group2['cum_sum'] / group2['cum_count']
    group2 = group2[['year'] + agg_cols + ['cum_mean_' + '_'.join(agg_cols)]]
    # Shift by one year: a row of year Y receives the mean accumulated up to year Y-1,
    # so the current year never contributes to its own feature.
    group2['year'] = group2['year'].map(lambda x : x+1)
    
    df = df.merge(group2, how='left', on=['year']+agg_cols)
    
    return df

In [35]:
#export
@log_step
def cum_mean_similar_periods_calculation(input_df, list_periods, list_agg_levels, target_name):
    """Apply ``cum_mean_similar_periods`` for every (period, aggregation level) pair.

    Pairs are processed period by period and, within a period, in the order of
    ``list_agg_levels``, starting from a copy of ``input_df``.
    """
    combos = [(period, agg_level) for period in list_periods for agg_level in list_agg_levels]
    return reduce(
        lambda acc, combo: cum_mean_similar_periods(acc, combo[0], combo[1], target_name),
        combos,
        input_df.copy(),
    )

In [36]:
# Target column on which the cumulative means are computed.
TARGET_NAME = 'sales'
# Period columns: one set of features is produced per period.
list_periods = ['week','day','month']
# Aggregation levels (each one is a list of columns); only the item level is used here.
list_agg_levels = [   ['item_id']]

In [37]:
# Columns taken from `features_time` for this feature group;
# `feature_formatting` drops them again from the result.
input_cols = ['sales','year','week','day','month','dayofyear','state_id','store_id','cat_id','dept_id','item_id']
input_df = get_input_df(input_cols)

features_cum_mean = input_df.pipe(cum_mean_similar_periods_calculation, list_periods, list_agg_levels, TARGET_NAME) \
                            .pipe(feature_formatting, input_cols)

INFO:root: -- Step cum_mean_similar_periods_calculation took 0:00:12.320478s - 2022-05-24 15:08:59.635713 --


Mem. usage decreased to 79.95 Mb (50.0% reduction)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4652320 entries, 0 to 4652319
Data columns (total 3 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   cum_mean_week_item_id   3694378 non-null  float16
 1   cum_mean_day_item_id    3907936 non-null  float16
 2   cum_mean_month_item_id  3740233 non-null  float16
dtypes: float16(3)None


### Pricing features

We then compute pricing features of different kinds.

1) Relative price difference within a group.

In [38]:
#export
def pricing_features_1(df_i, list_col_agg:list, target_var:str, output_var:str):
    """Compute the relative difference between a price and its group average.

    ``output_var = (target - group mean) / group mean``, or 0 when the group
    mean is 0. The group mean ignores missing values.

    Args:
        df_i: a pandas dataframe.
        list_col_agg: the list of columns defining the groups.
        target_var: the price column on which to calculate the relative difference.
        output_var: output column name.
    Returns:
        A new dataframe: the input columns plus ``output_var``.
    """
    # observed=True: with categorical keys, only key combinations present in the
    # data are aggregated. The others could never match a row in the left merge,
    # so the result is unchanged while the intermediate frame stays small.
    df_agg = (
        df_i.groupby(list_col_agg, observed=True)[target_var]
            .mean()
            .rename("avg_target")
            .reset_index()
    )
    df_i = pd.merge(df_i, df_agg, on=list_col_agg, how='left')
    df_i[output_var] = np.where(df_i["avg_target"] == 0
                                ,0
                                ,(df_i[target_var]-df_i["avg_target"])/df_i["avg_target"])
    df_i = df_i.drop(["avg_target"],axis=1)

    return df_i

In [39]:
#export
@log_step
def pricing_features_1_calculation(input_df, list_dict):
    """Apply ``pricing_features_1`` once per specification, starting from a copy of ``input_df``.

    Each item of ``list_dict`` is a dict providing the keys ``list_col_agg``,
    ``target_var`` and ``output_var``.
    """
    spec_keys = ("list_col_agg", "target_var", "output_var")
    result = input_df.copy()
    for spec in list_dict:
        result = pricing_features_1(df_i=result, **{key: spec[key] for key in spec_keys})
    return result

In [40]:
# One entry per relative-price feature: the group within which the average price
# is taken (`list_col_agg`), the price column and the name of the output column.
list_dict_1 = [   {"list_col_agg":["year_month","item_id"]
                  ,"target_var":"sell_price"
                  ,"output_var":"diff_price_same_month_item"}

                 ,{"list_col_agg":["date","item_id"]
                  ,"target_var":"sell_price"
                  ,"output_var":"diff_price_same_day_item"}

                 ,{"list_col_agg":["item_id"]
                  ,"target_var":"sell_price"
                  ,"output_var":"diff_price_same_item"} ]

2) Simple aggregations, at the item x store level : min, max, mean, std, nunique.

In [41]:
#export
def pricing_features_2(df_i, list_col_agg: list, aggregator, target_var: str, output_var: str):
    """Attach a per-group aggregate of a price column to every row.

    Args:
        df_i: a pandas dataframe.
        list_col_agg: the list of columns defining the groups.
        aggregator: the aggregation passed to ``groupby(...).agg`` (min, max, std, mean, ...).
        target_var: the price column to aggregate.
        output_var: output column name.
    Returns:
        A new dataframe: the input columns plus ``output_var``.
    """
    per_group = (
        df_i.groupby(list_col_agg)
            .agg({target_var: aggregator})
            .rename(columns={target_var: output_var})
            .reset_index()
    )
    return df_i.merge(per_group, on=list_col_agg, how='left')

In [42]:
#export
@log_step
def pricing_features_2_calculation(df, list_dict):
    """Apply ``pricing_features_2`` once per specification.

    Each item of ``list_dict`` is a dict providing the keys ``aggregator``,
    ``list_col_agg``, ``target_var`` and ``output_var``.
    """
    spec_keys = ("aggregator", "list_col_agg", "target_var", "output_var")
    for spec in list_dict:
        df = pricing_features_2(df_i=df, **{key: spec[key] for key in spec_keys})
    return df

In [43]:
#hide
def nunique(liste):
    """Return the number of distinct non-missing values in ``liste``.

    Missing values (NaN) are not counted, in line with the NaN-ignoring
    aggregators (nanmin, nanmax, nanmean, nanstd) used alongside this function.
    """
    return int(pd.Series(liste).nunique(dropna=True))

In [44]:
# One entry per aggregate feature at the item x store level.
# Aggregators are given by name: pandas already replaces the NumPy nan-functions
# by its own built-in (NaN-skipping) group aggregations, so np.nanstd in fact
# produced the pandas `std` (sample standard deviation, ddof=1). Naming the
# built-ins makes that explicit and keeps the values stable across pandas versions.
list_dict_2 = [   {"list_col_agg":["store_id","item_id"]
                  ,"aggregator": "min"
                  ,"target_var":"sell_price"
                  ,"output_var":"min_price_same_store_item"}

                 ,{"list_col_agg":["store_id","item_id"]
                  ,"aggregator": "max"
                  ,"target_var":"sell_price"
                  ,"output_var":"max_price_same_store_item"}

                 ,{"list_col_agg":["store_id","item_id"]
                  ,"aggregator": "mean"
                  ,"target_var":"sell_price"
                  ,"output_var":"mean_price_same_store_item"}

                 ,{"list_col_agg":["store_id","item_id"]
                  ,"aggregator": "std"
                  ,"target_var":"sell_price"
                  ,"output_var":"std_price_same_store_item"}

                 ,{"list_col_agg":["store_id","item_id"]
                  ,"aggregator": nunique
                  ,"target_var":"sell_price"
                  ,"output_var":"nunique_price_same_store_item"} ]

3) Price momentum and normalized price, which are among the price features that perform the best.

In [45]:
#export
@log_step
def pricing_features_3(df, prices, calendar):
    """Compute the normalized price and the price momentum features.

    Added columns:
        - ``price_norm``: ``sell_price / max_price_same_store_item`` (written on ``df`` itself);
        - ``price_momentum``: price divided by the previous row's price of the same
          store x item in ``prices`` (assumes ``prices`` is ordered by week within
          each store x item - TODO confirm);
        - ``price_momentum_m`` / ``price_momentum_y``: price divided by the mean
          price of the same store x item over rows sharing the same calendar
          month / the same year.

    Args:
        df: a pandas dataframe with ``sell_price``, ``max_price_same_store_item``,
            ``store_id``, ``item_id`` and ``wm_yr_wk``.
        prices: input dataframe prices_df.
        calendar: input dataframe calendar_df (``wm_yr_wk``, ``month``, ``year`` are used).
    Returns:
        The dataframe with the new feature columns.
    """
    price_keys = ["store_id","item_id","wm_yr_wk"]

    # Price max normalization
    df['price_norm'] = df['sell_price'] / df['max_price_same_store_item']

    # One calendar row per week: the month/year of the first row of each week is used.
    calendar_prices = calendar[['wm_yr_wk','month','year']].drop_duplicates(subset=['wm_yr_wk'])
    prices_df = prices.merge(calendar_prices, on=['wm_yr_wk'], how='left')

    # Price momentum : kind of price derivative.
    # groupby.shift is the vectorised equivalent of transform(lambda x: x.shift(1)).
    previous_price = prices_df.groupby(['store_id','item_id'])['sell_price'].shift(1)
    prices_df['price_momentum'] = prices_df['sell_price'] / previous_price

    # Ratio / average monthly price by item x store
    prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')

    # Ratio / average yearly price by item x store
    prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

    # Merge everything together
    prices_df = prices_df[price_keys + ["price_momentum","price_momentum_m","price_momentum_y"]]
    df = merge_by_concat(df, prices_df, price_keys)

    return df

Let's calculate all these features successively.

In [46]:
# Columns taken from `features_time` for the pricing features.
input_cols = ['sell_price','date','year_month','wm_yr_wk','store_id','cat_id','dept_id','item_id']
input_df = get_input_df(input_cols)

# The raw prices and calendar files are read again here because `prices_df`
# and `calendar_df` were deleted after pre-processing.
features_prices = input_df.pipe(pricing_features_1_calculation, list_dict_1) \
                          .pipe(pricing_features_2_calculation, list_dict_2) \
                          .pipe(pricing_features_3, prices = pd.read_csv(f"{prices_path}"),
                                                    calendar = pd.read_csv(f"{calendar_path}")) \
                          .pipe(feature_formatting, input_cols)

INFO:root: -- Step pricing_features_1_calculation took 0:00:24.323651s - 2022-05-24 15:09:18.482951 --
INFO:root: -- Step pricing_features_2_calculation took 0:00:11.795279s - 2022-05-24 15:09:42.809812 --
INFO:root: -- Step pricing_features_3 took 0:00:28.303708s - 2022-05-24 15:09:57.793880 --


Mem. usage decreased to 155.37 Mb (47.0% reduction)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4652320 entries, 0 to 4652319
Data columns (total 12 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   diff_price_same_month_item     4652320 non-null  float16
 1   diff_price_same_day_item       4652320 non-null  float16
 2   diff_price_same_item           4652320 non-null  float16
 3   min_price_same_store_item      4652320 non-null  float16
 4   max_price_same_store_item      4652320 non-null  float16
 5   mean_price_same_store_item     4652320 non-null  float16
 6   std_price_same_store_item      4652320 non-null  float16
 7   nunique_price_same_store_item  4652320 non-null  int8   
 8   price_norm                     4652320 non-null  float16
 9   price_momentum                 4631320 non-null  float16
 10  price_momentum_m               4652320 non-null  float16
 11  price_momentum_y        

### Lags of target variable

The following lags of the target variable are computed :
 - Days : 8, 9, 10, ..., 35
 - Years : 1
 
Some rolling means are also calculated :
- 8-14 days, 8-21 days, 8-28 days, 8-35 days
- 15-22 days, 15-29 days, 15-36 days, 15-43 days
- 22-29 days, 22-36 days, 22-43 days, 22-50 days
- 29-35 days, 29-42 days, 29-49 days, 29-56 days

And some rolling standard deviations as well :
- 8-14 days, 8-35 days
- 15-22 days, 15-43 days
- 22-29 days, 22-50 days
- 29-35 days, 29-56 days

Lags from day 1 to day 7 are not calculated because the prediction horizon will be at least one week, therefore these features wouldn't be possible to get for the prediction set. 

In [47]:
#export
@log_step
def lags_features(df, merge_cols:list, lags:list, target_var:str, lag_type:str):
    """
    Add lagged copies of a column to a dataframe.

    For every lag ``i`` in ``lags`` a column named
    ``<target_var>_lag_<lag_type>_<i>`` is added. It holds the value that
    ``target_var`` had ``i`` periods earlier for the same combination of the
    other ``merge_cols``. Rows without a matching earlier row get 0.

    Args:
        param1 (df): a pandas dataframe containing merge_cols and target_var
        param2 (merge_cols): the list of columns used as join keys; it has to
            contain a datetime column named 'date', which is the one shifted
        param3 (lags): the list of lags being integers
        param4 (target_var): the column you want to apply the lags on
        param5 (lag_type): unit of the lags: 'days', 'weeks' or 'months'
    Returns:
        df: a new dataframe, the input one plus one column per lag.
    Raises:
        ValueError: if merge_cols or lags is not a list, or lag_type is invalid.
    """
    if not isinstance(merge_cols, list):
        raise ValueError('merge_cols must be a list.')
    if not isinstance(lags, list):
        raise ValueError('lags must be a list.')
    if lag_type not in ['days', 'weeks', 'months']:
        raise ValueError('lag_type is not valid, either "days", "weeks" or "months" ')

    tmp = df[merge_cols + [target_var]]

    for i in lags:
        lag_col = target_var + '_lag_{}_'.format(lag_type) + str(i)

        # datetime.timedelta has no 'months' keyword, so the advertised
        # 'months' lag type needs a calendar-aware offset instead.
        if lag_type == 'months':
            offset = pd.DateOffset(months=i)
        else:
            offset = datetime.timedelta(**{lag_type: i})

        # Moving the dates forward by the lag makes the value observed at
        # date d line up with the row dated d + lag in the left join below.
        shifted = tmp.copy()
        shifted.columns = merge_cols + [lag_col]
        shifted['date'] += offset
        df = pd.merge(df, shifted, on=merge_cols, how='left')

        # NOTE(review): the cast to int truncates fractional values when
        # target_var is not integer-valued (e.g. an averaged column).
        df[lag_col] = df[lag_col].fillna(0).astype(int)

    return df

In [48]:
#export
@log_step
def rolling_lags_features(df, lag_type:str, days_origin:list, target_var:str):
    """
    Compute rolling means and standard deviations over existing lag columns.

    For each origin ``o`` in ``days_origin``, row-wise means are computed over
    the lag columns ``o..o+6``, ``o..o+13``, ``o..o+20`` and ``o..o+27``, and
    row-wise standard deviations over ``o..o+6`` and ``o..o+27``. The
    individual lag columns 36 to 56 are then dropped.

    Args:
        param1 (df): a pandas dataframe with lags features already calculated
            (columns named ``<target_var>_lag_<lag_type>_<i>``). The rolling
            columns are added to this dataframe in place.
        param2 (lag_type): type of lags considered, as used in the lag column names
        param3 (days_origin): list of days from which rolling periods start
        param4 (target_var): the name of the target variable, on which lags are calculated
    Returns:
        df: the dataframe with the rolling features, without lag columns 36-56.
    """
    def lag_columns(first, last):
        # Names of the individual lag columns from `first` to `last`, inclusive.
        return [f"{target_var}_lag_{lag_type}_{i}" for i in range(first, last + 1)]

    for day_origin in days_origin:

        # Mean over windows of 1, 2, 3 and 4 weeks
        for rolling_period in [day_origin+6, day_origin+13, day_origin+20, day_origin+27]:
            col_name = f"mean_{day_origin}-{rolling_period}_{lag_type}_{target_var}"
            df[col_name] = df[lag_columns(day_origin, rolling_period)].mean(axis=1)

        # Std over the shortest and the longest window only
        for rolling_period in [day_origin+6, day_origin+27]:
            col_name = f"std_{day_origin}-{rolling_period}_{lag_type}_{target_var}"
            df[col_name] = df[lag_columns(day_origin, rolling_period)].std(axis=1)

    # Remove the individual lags 36..56. The names are built from target_var and
    # lag_type (not a fixed 'sales_lag_days_' prefix), and only columns actually
    # present are dropped, so other targets or shorter lag lists do not raise.
    cols_to_remove = [c for c in lag_columns(36, 56) if c in df.columns]
    df = df.drop(cols_to_remove, axis=1)

    return df

In [49]:
# Configuration of the individual lags and of the rolling windows built on them.
TARGET_NAME = 'sales'
LAG_TYPE = 'days'
MERGE_COLS = ['date', 'store_id', 'item_id']
# First day of each rolling window: one origin per week, from day 8 to day 29.
DAYS_ORIGIN = list(range(8, 30, 7))
# Daily lags 8..56 feed the rolling windows; 365 and 366 are the one-year lags.
LAGS = [*range(8, 57), 365, 366]

In [50]:
input_cols = ['sales', 'date', 'store_id', 'item_id']
input_df = get_input_df(input_cols)

# Individual lags first, then the rolling statistics that are built on them.
features_lags = (
    input_df
    .pipe(lags_features, MERGE_COLS, LAGS, TARGET_NAME, LAG_TYPE)
    .pipe(rolling_lags_features, LAG_TYPE, DAYS_ORIGIN, TARGET_NAME)
    .pipe(feature_formatting, input_cols)
)

INFO:root: -- Step lags_features took 0:07:09.997227s - 2022-05-24 15:10:32.372023 --
INFO:root: -- Step rolling_lags_features took 0:07:46.457791s - 2022-05-24 15:17:42.593051 --


Mem. usage decreased to 532.50 Mb (73.0% reduction)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4652320 entries, 0 to 4652319
Data columns (total 54 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   sales_lag_days_8       4652320 non-null  int16  
 1   sales_lag_days_9       4652320 non-null  int16  
 2   sales_lag_days_10      4652320 non-null  int16  
 3   sales_lag_days_11      4652320 non-null  int16  
 4   sales_lag_days_12      4652320 non-null  int16  
 5   sales_lag_days_13      4652320 non-null  int16  
 6   sales_lag_days_14      4652320 non-null  int16  
 7   sales_lag_days_15      4652320 non-null  int16  
 8   sales_lag_days_16      4652320 non-null  int16  
 9   sales_lag_days_17      4652320 non-null  int16  
 10  sales_lag_days_18      4652320 non-null  int16  
 11  sales_lag_days_19      4652320 non-null  int16  
 12  sales_lag_days_20      4652320 non-null  int16  
 13  sales_lag_days_21   

### Lags of target variable aggregated

We can also aggregate sales by day at different levels (store, item, category...) and then compute lags of these aggregated sales.

In [51]:
#export
def lags_features_agg(df, target_name:str, avg_col_list:list, col_name:str, merge_cols:list, list_lags:list, lag_type:str):
    """
    Compute lags of the target variable averaged at a given aggregation level.

    The mean of ``target_name`` is computed for each group of ``avg_col_list``,
    attached to every row as ``col_name`` (float16), lagged with
    ``lags_features`` and finally the un-lagged ``col_name`` column is removed.

    Args:
        param1 (df): a pandas dataframe
        param2 (target_name): the name of the column you want to do the avg on
        param3 (avg_col_list): the agg cols to use to do the avg
        param4 (col_name): the name of the intermediate averaged column,
            used as prefix of the lag columns
        param5 (merge_cols): the columns to apply the lags to
        param6 (list_lags): the list of lags being integers
        param7 (lag_type): lag_type, either 'days', 'weeks' or 'months'
    Returns:
        df: the input dataframe with the lags of the averaged target added.
    Raises:
        ValueError: if a list argument is not a list or lag_type is invalid.
    """
    list_arguments = (
        ('avg_col_list', avg_col_list),
        ('merge_cols', merge_cols),
        ('list_lags', list_lags),
    )
    for arg_name, arg_value in list_arguments:
        if not isinstance(arg_value, list):
            raise ValueError(f'{arg_name} must be a list.')
    if lag_type not in ('days', 'weeks', 'months'):
        raise ValueError('lag_type is not valid, either "days", "weeks" or "months".')

    # One row per aggregation group, holding the mean of the target
    averages = (
        df.groupby(avg_col_list)[target_name]
          .mean()
          .rename(col_name)
          .reset_index()
    )

    with_avg = df.merge(averages, on=avg_col_list, how='left')
    with_avg[col_name] = with_avg[col_name].astype(np.float16)

    # Only the lagged versions are kept; the same-day average is dropped
    lagged = lags_features(with_avg, merge_cols, list_lags, col_name, lag_type)
    return lagged.drop(columns=[col_name])

In [52]:
#export
@log_step
def lags_features_agg_calculation(input_df, target_name, merge_cols, lags, lag_type, list_dict):
    """
    Apply lags_features_agg once per aggregation level.

    Args:
        input_df: a pandas dataframe; it is copied, not modified
        target_name: the name of the column to average
        merge_cols: the columns to apply the lags to
        lags: the list of lags being integers
        lag_type: lag_type forwarded to lags_features_agg
        list_dict: list of dicts with keys 'avg_col_list' (grouping columns)
            and 'col_name' (name of the averaged column)
    Returns:
        A copy of input_df with the lag columns of every aggregation level.
    """
    result = input_df.copy()
    for level in list_dict:
        result = lags_features_agg(result,
                                   target_name,
                                   level['avg_col_list'],
                                   level['col_name'],
                                   merge_cols,
                                   lags,
                                   lag_type)
    return result

In [53]:
# Configuration of the lags computed on aggregated sales.
TARGET_NAME = 'sales'
LAG_TYPE = 'days'
MERGE_COLS = ['date', 'store_id', 'item_id']
LAGS = [1, *range(8, 30, 7), 100, 365]

# One entry per aggregation level: grouping columns and name of the averaged column.
list_dict = [
    {"avg_col_list": avg_cols, "col_name": name}
    for avg_cols, name in (
        (['date_block_num_day'], "date_avg_sales"),
        (['date_block_num_day', 'store_id'], "date_store_avg_sales"),
        (['date_block_num_day', 'item_id'], 'date_item_avg_sales'),
    )
]

In [54]:
input_cols = ['sales', 'date', 'store_id', 'item_id', 'dept_id', 'cat_id', 'state_id']
input_df = get_input_df(input_cols)

# Lags of the averaged sales for every aggregation level of list_dict.
features_lags_agg = (
    input_df
    .pipe(lags_features_agg_calculation, TARGET_NAME, MERGE_COLS, LAGS, LAG_TYPE, list_dict)
    .pipe(feature_formatting, input_cols)
)

INFO:root: -- Step lags_features took 0:01:20.055982s - 2022-05-24 15:30:38.695312 --
INFO:root: -- Step lags_features took 0:01:14.642291s - 2022-05-24 15:32:04.958726 --
INFO:root: -- Step lags_features took 0:02:50.615622s - 2022-05-24 15:33:49.948290 --
INFO:root: -- Step lags_features_agg_calculation took 0:06:14.727996s - 2022-05-24 15:30:31.921068 --


Mem. usage decreased to 146.50 Mb (81.7% reduction)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4652320 entries, 0 to 4652319
Data columns (total 21 columns):
 #   Column                             Non-Null Count    Dtype
---  ------                             --------------    -----
 0   date_avg_sales_lag_days_1          4652320 non-null  int8 
 1   date_avg_sales_lag_days_8          4652320 non-null  int8 
 2   date_avg_sales_lag_days_15         4652320 non-null  int8 
 3   date_avg_sales_lag_days_22         4652320 non-null  int8 
 4   date_avg_sales_lag_days_29         4652320 non-null  int8 
 5   date_avg_sales_lag_days_100        4652320 non-null  int8 
 6   date_avg_sales_lag_days_365        4652320 non-null  int8 
 7   date_store_avg_sales_lag_days_1    4652320 non-null  int8 
 8   date_store_avg_sales_lag_days_8    4652320 non-null  int8 
 9   date_store_avg_sales_lag_days_15   4652320 non-null  int8 
 10  date_store_avg_sales_lag_days_22   4652320 non-null  int8 
 11

### Target encoding for categorical features

Categorical features in the dataset need to be encoded (except if a catboost model is used). As some of them have a high cardinality (like item_id that can take more than 3000 distinct values), a one-hot encoding is not the preferred solution. 

A target encoding will be used instead : each modality of a given categorical feature is encoded by taking the average (or standard deviation) of sales among all rows of that modality. To avoid leakage, rows that will be used for validation or prediction are not considered when computing the mean or the std.

All the categorical features will be encoded, and some combinations of them as well (combinations defined in the `to_encode` list below).

In [55]:
#export
def target_encoding(df, target_name:str, cols_to_encode:list, encoding_func:list, exclude_column:str, exclude_values:list):
    """
    Encode a categorical feature (or a group of them) with aggregated target values.

    The target of the rows whose ``exclude_column`` value is in
    ``exclude_values`` is replaced by NaN before aggregating, so these rows do
    not contribute to the statistics. One column named
    ``<cols joined by '_'>_<function name>_encod`` is added per function.

    Args:
        param1 (df): a pandas dataframe
        param2 (target_name): name of target column
        param3 (cols_to_encode): list of the names of the columns to encode
        param4 (encoding_func): list of the functions to use for encoding
            (their ``__name__`` is used in the output column names)
        param5 (exclude_column): name of column based on which some target values are ignored
        param6 (exclude_values): list of values of exclude_column defining the ignored rows
    Returns:
        df: a new dataframe, the input one plus the encoding columns.
    """
    prefix = '_'.join(cols_to_encode)

    # Blank out the target of the excluded rows on a modified copy only;
    # the target column of the returned dataframe stays untouched.
    excluded_rows = df[exclude_column].isin(exclude_values)
    masked = df.assign(**{target_name: df[target_name].mask(excluded_rows)})

    encodings = masked.groupby(cols_to_encode)[target_name].agg(encoding_func).reset_index()
    encodings.columns = cols_to_encode + [f"{prefix}_{func.__name__}_encod" for func in encoding_func]

    return df.merge(encodings, on=cols_to_encode, how='left')

In [56]:
#export
@log_step
def target_encoding_calculation(input_df, target_name, to_encode, encoding_functions, exclude_column, exclude_values):
    """
    Apply target_encoding once per group of columns listed in to_encode.

    Args:
        input_df: a pandas dataframe; it is copied, not modified
        target_name: name of target column
        to_encode: list of lists of column names, one inner list per encoding
        encoding_functions: list of the functions to use for encoding
        exclude_column: name of the column used to ignore some rows
        exclude_values: values of exclude_column whose rows are ignored
    Returns:
        A copy of input_df with the encoding columns of every group.
    """
    encoded = input_df.copy()
    for columns_group in to_encode:
        encoded = target_encoding(encoded,
                                  target_name,
                                  columns_group,
                                  encoding_functions,
                                  exclude_column,
                                  exclude_values)
    return encoded

In [57]:
# Configuration of the static target encoding.
TARGET_NAME = 'sales'
# NaN-aware aggregations: excluded rows have their target set to NaN before encoding.
ENCODING_FUNCTIONS = [np.nanmean, np.nanstd]
EXCLUDE_COLUMN = 'date_block_num_day'
# Days whose target must not contribute to the encoding (validation folds and prediction period).
# NOTE(review): assumes the DAYS_* objects (defined earlier in the notebook) are lists, so that
# `+` concatenates them — confirm.
EXCLUDE_VALUES = DAYS_VALID1 + DAYS_VALID2 + DAYS_VALID3 + DAYS_VALID4 + DAYS_VALID5 + DAYS_PRED

# Each inner list is one group of columns encoded together.
to_encode = [ ['item_id']]

In [58]:
input_cols = ['sales', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id']
input_df = get_input_df(input_cols)

# Static target encoding of every group of columns listed in to_encode.
features_cat_encoding = (
    input_df
    .pipe(target_encoding_calculation, TARGET_NAME, to_encode, ENCODING_FUNCTIONS, EXCLUDE_COLUMN, EXCLUDE_VALUES)
    .pipe(feature_formatting, input_cols)
)

INFO:root: -- Step target_encoding_calculation took 0:00:01.505591s - 2022-05-24 15:37:11.868689 --


Mem. usage decreased to 71.07 Mb (27.2% reduction)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4652320 entries, 0 to 4652319
Data columns (total 2 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   item_id_nanmean_encod  4652320 non-null  float16
 1   item_id_nanstd_encod   4652320 non-null  float16
dtypes: float16(2)None


### Dynamic target encoding for categorical features

The target encoding done just above is static : the average of sales is done on the whole dataset, at any time. A dynamic target encoding might be more accurate : it only takes into account sales that occurred up to (and including) the current date.

In [59]:
#export
def target_encoding_dynamic(df, target_name:str, cols_to_encode:list, date_column:str):
    """
    Encode a categorical feature (or a group of them) with a cumulative mean of the target.

    For each modality of ``cols_to_encode`` and each date, the encoding is the
    mean of ``target_name`` over all rows of that modality from the first date
    up to and including the current one. Dates whose target sum is NaN are left
    out, so the matching rows get a NaN encoding. The new column is named
    ``<cols joined by '_'>_cum_mean_encod``.

    Args:
        param1 (df): a pandas dataframe
        param2 (target_name): name of target column
        param3 (cols_to_encode): list of the names of the columns to encode
        param4 (date_column): name of date column
    Returns:
        df: a new dataframe, the input one plus the encoding column.
    """
    # Scaled sum, only used to avoid getting "inf" when sums become too large.
    # The function name matters: it is the label of the aggregated column below.
    def sum_div100000(x):
        return 0.00001 * sum(x)

    encoded_col = '_'.join(cols_to_encode) + '_cum_mean_encod'

    # Row count and scaled sum of the target per modality and per date
    daily = df.groupby(cols_to_encode + [date_column])[target_name].agg(['count', sum_div100000]).reset_index()
    daily = daily.dropna(subset=['sum_div100000'])

    # Running totals over the dates, within each modality
    running = daily.groupby(cols_to_encode).cumsum()[['count', 'sum_div100000']]
    running.columns = ['cum_count', 'cum_sum']

    daily_cum = daily.merge(running, how='left', left_index=True, right_index=True)
    # Undo the 1e-5 scaling while turning the running totals into a mean
    daily_cum[encoded_col] = 100000 * daily_cum['cum_sum'] / daily_cum['cum_count']
    lookup = daily_cum[[date_column] + cols_to_encode + [encoded_col]]

    return df.merge(lookup, how='left', on=[date_column] + cols_to_encode)

In [60]:
#export
@log_step
def target_encoding_dynamic_calculation(input_df, target_name, date_column, to_encode):
    """
    Apply target_encoding_dynamic once per group of columns listed in to_encode.

    Args:
        input_df: a pandas dataframe; it is copied, not modified
        target_name: name of target column
        date_column: name of date column
        to_encode: list of lists of column names, one inner list per encoding
    Returns:
        A copy of input_df with the dynamic encoding column of every group.
    """
    encoded = input_df.copy()
    for columns_group in to_encode:
        encoded = target_encoding_dynamic(encoded, target_name, columns_group, date_column)
    return encoded

In [61]:
# Configuration of the dynamic target encoding.
TARGET_NAME, DATE_COLUMN = 'sales', 'date_block_num_day'

# Each inner list is one group of columns encoded together.
to_encode = [['item_id']]

In [62]:
input_cols = ['sales', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id']
input_df = get_input_df(input_cols)

# Dynamic (cumulative) target encoding of every group of columns listed in to_encode.
features_cat_encoding_dyn = (
    input_df
    .pipe(target_encoding_dynamic_calculation, TARGET_NAME, DATE_COLUMN, to_encode)
    .pipe(feature_formatting, input_cols)
)

INFO:root: -- Step target_encoding_dynamic_calculation took 0:00:31.623542s - 2022-05-24 15:37:14.169961 --


Mem. usage decreased to 62.20 Mb (30.0% reduction)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4652320 entries, 0 to 4652319
Data columns (total 1 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   item_id_cum_mean_encod  4568320 non-null  float16
dtypes: float16(1)None


### Calendar events encoding

As suggested in several Kaggle discussions, the columns `event_name_1` and `event_name_2` are encoded as follows. One column is created for each type of event (ex: Christmas, Thanksgiving...). For each event, the feature is :

- Negative the previous 25 days (-25, -24, .. -1)
- Equal to 0 on the day of the event
- Positive for the next 25 days (1, 2, .. 25)
- NaN otherwise

The objective is to model the impact of the event on sales before and after D-day.

In [63]:
#export
@log_step
def calendar_events_encoding(df, event_columns:list, date_col_name:str, days_range:list):
    """
    Encode events by adding one column per event to the original dataframe.

    For a given event, the value of the feature is the signed distance in days
    to the event: negative before it, 0 on the day of the event, positive
    after it, and NaN for days outside ``days_range``. When a day falls within
    range of several occurrences of the same event, the largest offset is kept.

    Args:
        param1 (df): a pandas dataframe; the event columns are added to it in place
        param2 (event_columns): list of columns to be encoded, whose values are events names
        param3 (date_col_name): name of date column, where date is coded as an integer
        param4 (days_range): iterable of integer offsets (e.g. range(-25, 26))
            defining the days around the event that receive a value
    Returns:
        df: the input dataframe with one additional column per event name.
    """
    # Distinct event names in first-seen order, so that the order of the new
    # columns is the same from one run to the next (a set gives no such guarantee).
    all_events = list(dict.fromkeys(
        event
        for col in event_columns
        for event in df[col].dropna().unique()
    ))

    for event in all_events:

        # All the days on which the event occurs, whichever column reports it
        event_days = set()
        for col in event_columns:
            event_days.update(df.loc[df[col] == event, date_col_name])

        # day -> offset lookup built from the days_range argument (not from a
        # notebook-level global). Offsets are visited in order, so a later
        # offset overwrites an earlier one for the same day.
        offset_by_day = {day + i: i for i in days_range for day in event_days}

        # A single vectorised lookup per event instead of one pass per offset
        if offset_by_day:
            df[event] = df[date_col_name].map(offset_by_day)

    return df

In [64]:
# Configuration of the calendar events encoding.
DATE_COL = 'date_block_num_day'
EVENT_COLS = [f'event_name_{k}' for k in (1, 2)]
# Offsets around an event that receive a value: 25 days before to 25 days after.
DAYS_RANGE = range(-25, 26)

In [67]:
input_cols = ['event_name_1', 'event_name_2']
input_df = get_input_df(input_cols)

# One column per event, holding the signed distance in days to the event.
features_events_encoding = (
    input_df
    .pipe(calendar_events_encoding, EVENT_COLS, DATE_COL, DAYS_RANGE)
    .pipe(feature_formatting, input_cols)
)

INFO:root: -- Step calendar_events_encoding took 0:02:12.948402s - 2022-05-24 15:46:07.632562 --


Mem. usage decreased to 284.04 Mb (73.8% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4652320 entries, 0 to 4652319
Data columns (total 30 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Cinco De Mayo        725855 non-null  float16
 1   PresidentsDay        691154 non-null  float16
 2   SuperBowl            667423 non-null  float16
 3   ValentinesDay        680876 non-null  float16
 4   Pesach End           719821 non-null  float16
 5   Ramadan starts       705933 non-null  float16
 6   Eid al-Fitr          599891 non-null  float16
 7   NBAFinalsStart       709756 non-null  float16
 8   LentWeek2            702969 non-null  float16
 9   LaborDay             603637 non-null  float16
 10  NewYear              620818 non-null  float16
 11  Christmas            619468 non-null  float16
 12  StPatricksDay        707828 non-null  float16
 13  LentStart            699060 non-null  float16
 14  IndependenceDa

## Post-processing

### Merging all sub dataframes

The features have been stored in different dataframes. Let's merge them all.

In [68]:
# All the feature dataframes computed above, in the order in which they are merged.
all_df = [features_time, features_cum_mean, features_prices, features_lags, features_lags_agg, 
          features_cat_encoding, features_cat_encoding_dyn, features_events_encoding]
# Columns passed to merge_by_concat (defined earlier in the notebook) for every pairwise merge.
# NOTE(review): presumably these are the key columns shared by all dataframes — confirm
# against the definition of merge_by_concat.
index_cols = ['id','date_block_num_day']
# Fold the list from left to right: ((features_time + features_cum_mean) + features_prices) + ...
df_final = reduce(lambda left, right : merge_by_concat(left, right, index_cols), all_df)

### Rows and columns removal

We remove the first year because the lags are not calculated properly on it.

In [69]:
# Keep only the rows dated strictly more than 366 days after the first date,
# 366 days being the longest lag computed above.
min_date_lag = df_final["date"].min() + timedelta(days=366)
is_after_first_year = df_final["date"] > min_date_lag
df_final = df_final.loc[is_after_first_year]

Then we drop useless columns.

In [70]:
# Identifier and raw calendar columns that are not kept in the final dataset.
list_col_to_drop = [
    'id', 'year_month', 'wm_yr_wk',
    'event_name_1', 'event_type_1',
    'event_name_2', 'event_type_2',
]
df_final = df_final.drop(columns=list_col_to_drop)

In [71]:
# Display the dtype of every remaining column as a final sanity check.
df_final.dtypes

item_id              category
dept_id              category
cat_id               category
store_id             category
state_id             category
                       ...   
MemorialDay           float16
VeteransDay           float16
Purim End             float16
OrthodoxChristmas     float16
EidAlAdha             float16
Length: 145, dtype: object

In [72]:
# Persist the final dataframe as a pickle, with a path relative to the working directory.
# NOTE(review): the path evaluates to "train_df_final_.pkl"; the trailing underscore looks like
# the remnant of a removed suffix — confirm before renaming, as readers of this file
# have to use the same name.
pd.to_pickle(df_final, "train_df_final_" + ".pkl")