In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
from datetime import date
import scipy

In [2]:
from process import process
from process import fix_nan

In [3]:
from process import add_all_holidays

In [11]:
raw_data = pd.read_csv("../demand-forecasting-in-retail/train.csv")

In [14]:
raw_data = fix_nan(raw_data)

In [13]:
# raw_data

In [6]:
def percentile(n):
    '''Calculate n - percentile of data'''
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'pctl%s' % n
    return percentile_

In [16]:
def lagged_features(df
                    , lags = [7, 14, 21, 28]
                    , windows = [7, 14]
                    , aggregation_methods = {'mean', 'median', percentile(10),  percentile(90)}
                    , promo_filters = [0, 1]
                    , target_var = 'demand'
                    , by_all_stores = False
                    , by_all_products = False):
    
    if len(df) == 0:
        return df
    
                
    
    # loop by filter variables and window
    for w in windows:

        # check whether filtered df in not empty
        if len(df) > 0:
            
            # lagged features calculation
            lf_df = df.set_index(['product_rk', 'store_location_rk', 'period_start_dt'])\
                 [target_var].groupby(level=['product_rk','store_location_rk']).\
                apply(lambda x: x.rolling(window=w, min_periods=1).agg(aggregation_methods))

            # provide lags tranformations
            for l in lags:
                new_names = {x: "{3}_lag{0}_wdw{1}_{2}".
                              format(l, w, x, target_var) for x in lf_df.columns }

                df = df.merge(lf_df.shift(l).reset_index().rename(columns = new_names),
                    how='left', on =['product_rk', 'store_location_rk', 'period_start_dt'] )
                
                
            if by_all_stores:
                lf_df = df.set_index(['product_rk', 'period_start_dt']).\
                    demand.groupby(level=['product_rk', 'period_start_dt']).median().\
                    groupby(level=['product_rk']).\
                    apply(lambda x: x.rolling(window=w, min_periods=1).agg(aggregation_methods))

                for l in lags:
                    new_names = {x: "all_stores_{3}_lag{0}_wdw{1}_{2}".
                                  format(l, w, x, target_var) for x in lf_df.columns }

                    df = df.merge(lf_df.shift(l).reset_index().rename(columns = new_names),
                        how='left', on =['product_rk', 'period_start_dt'] )
                    
            if by_all_products:
                lf_df = df.set_index(['store_location_rk', 'period_start_dt'])[target_var]\
                .groupby(level=['store_location_rk', 'period_start_dt']).median().\
                    groupby(level=['store_location_rk']).\
                    apply(lambda x: x.rolling(window=w, min_periods=1).agg(aggregation_methods))

                for l in lags:
                    new_names = {x: "all_products_{3}_lag{0}_wdw{1}_{2}".
                                  format(l, w, x, target_var) for x in lf_df.columns }

                    df = df.merge(lf_df.shift(l).reset_index().rename(columns = new_names),
                        how='left', on =['store_location_rk', 'period_start_dt'] )

    return df

In [17]:
lagged_features(raw_data, target_var='PRICE_REGULAR')

Unnamed: 0.1,Unnamed: 0,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,PRICE_REGULAR_lag7_wdw7_pctl90,...,PRICE_REGULAR_lag14_wdw14_median,PRICE_REGULAR_lag14_wdw14_mean,PRICE_REGULAR_lag21_wdw14_pctl90,PRICE_REGULAR_lag21_wdw14_pctl10,PRICE_REGULAR_lag21_wdw14_median,PRICE_REGULAR_lag21_wdw14_mean,PRICE_REGULAR_lag28_wdw14_pctl90,PRICE_REGULAR_lag28_wdw14_pctl10,PRICE_REGULAR_lag28_wdw14_median,PRICE_REGULAR_lag28_wdw14_mean
0,0,40369,309,2016-12-19,29.0,0.0,,,0.0,,...,,,,,,,,,,
1,1,40370,309,2016-12-19,64.0,0.0,,,0.0,,...,,,,,,,,,,
2,2,40372,309,2016-12-19,32.0,0.0,,,0.0,,...,,,,,,,,,,
3,3,40373,309,2016-12-19,10.0,0.0,,,0.0,,...,,,,,,,,,,
4,4,46272,309,2016-12-19,15.0,0.0,,,0.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,1000.00,1000.0,1.0,500.00,...,70.0,83.848673,284.290000,253.285714,284.29,276.667959,3000.000000,3000.000000,3000.00,3000.000000
35340,35538,40372,1380,2019-12-30,,0.0,2000.00,2000.0,1.0,1000.00,...,500.0,500.000000,108.200714,70.000000,70.00,79.338367,284.290000,253.285714,284.29,277.718673
35341,35539,40373,1380,2019-12-30,,0.0,3000.00,3000.0,1.0,2000.00,...,1000.0,1000.000000,500.000000,500.000000,500.00,500.000000,92.785714,70.000000,70.00,76.307653
35342,35540,46272,1380,2019-12-30,,1.0,284.29,199.0,1.0,3000.00,...,2000.0,2000.000000,1000.000000,1000.000000,1000.00,1000.000000,500.000000,500.000000,500.00,500.000000
