In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from copy import deepcopy
from itertools import product
from ipywidgets import IntProgress
from IPython.display import display
import warnings

# Install a global "ignore" filter: every warning raised after this point is
# suppressed, including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [13]:
# Read the raw train/test tables, parse the day-first dates and index each
# frame by its Date column.
train = (
    pd.read_csv('train_kaggle.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .set_index('Date')
)
test = (
    pd.read_csv('test_kaggle.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .set_index('Date')
)

In [14]:
# Rows with no Promo flag are treated as 0.
train['Promo'] = train['Promo'].fillna(0)

# Fill gaps in Regular_Price inside each (SKU, store) series. Changing the
# index does not scope ffill/bfill, so the grouping has to be explicit;
# otherwise a price can be carried over from a neighbouring product or store.
# NOTE(review): assumes rows are in chronological order within each series.
series_keys = ['SKU_id', 'Store_id']
train['Regular_Price'] = train.groupby(series_keys)['Regular_Price'].ffill()
train['Regular_Price'] = train.groupby(series_keys)['Regular_Price'].bfill()
# A series with no regular price at all falls back to a frame-order fill so
# that no row is left without a price.
train['Regular_Price'] = train['Regular_Price'].ffill().bfill()

# Actual price: the promo price where one is present, the regular price otherwise.
train['Actual_Price'] = train['Promo_Price'].combine_first(train['Regular_Price'])
train.head()

Unnamed: 0_level_0,Store_id,SKU_id,Promo,Demand,Regular_Price,Promo_Price,Actual_Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01,1,1,0.0,22,163.78,,163.78
2015-01-02,1,1,0.0,41,163.78,,163.78
2015-01-03,1,1,0.0,35,163.78,,163.78
2015-01-04,1,1,0.0,72,163.78,,163.78
2015-01-05,1,1,0.0,25,163.78,,163.78


In [15]:
# Rows with no Promo flag are treated as 0.
test['Promo'] = test['Promo'].fillna(0)

# Fill gaps in Regular_Price inside each (SKU, store) series. Changing the
# index does not scope ffill/bfill, so the grouping has to be explicit;
# otherwise a price can be carried over from a neighbouring product or store.
# NOTE(review): assumes rows are in chronological order within each series.
series_keys = ['SKU_id', 'Store_id']
test['Regular_Price'] = test.groupby(series_keys)['Regular_Price'].ffill()
test['Regular_Price'] = test.groupby(series_keys)['Regular_Price'].bfill()
# A series with no regular price at all falls back to a frame-order fill so
# that no row is left without a price.
test['Regular_Price'] = test['Regular_Price'].ffill().bfill()

# Actual price: the promo price where one is present, the regular price otherwise.
test['Actual_Price'] = test['Promo_Price'].combine_first(test['Regular_Price'])
test.head()

Unnamed: 0_level_0,Store_id,SKU_id,Promo,Demand,Regular_Price,Promo_Price,Actual_Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-23,1,1,1.0,,128.98,119.6,119.6
2016-05-24,1,1,0.0,,128.98,,128.98
2016-05-25,1,1,0.0,,131.7,,131.7
2016-05-26,1,1,0.0,,131.7,,131.7
2016-05-27,1,1,0.0,,131.7,,131.7


In [16]:
# Move Date from the index back to a column, then derive calendar features.
# weekday: 0 = Monday ... 6 = Sunday; is_weekend is 1 for Saturday/Sunday.
train.reset_index(inplace=True)
train['weekday'] = train['Date'].dt.dayofweek
train['monthday'] = train['Date'].dt.day
train['is_weekend'] = train['weekday'].isin([5, 6]).astype('int64')

# Same features for the test period.
test.reset_index(inplace=True)
test['weekday'] = test['Date'].dt.dayofweek
test['monthday'] = test['Date'].dt.day
test['is_weekend'] = test['weekday'].isin([5, 6]).astype('int64')

In [17]:
# Stack train on top of test; the set_index/reset_index round trip puts Date
# in the first column and replaces the index with a fresh 0..n-1 range.
data = (
    pd.concat([train, test])
    .set_index('Date')
    .reset_index()
)
data.head()

Unnamed: 0,Date,Store_id,SKU_id,Promo,Demand,Regular_Price,Promo_Price,Actual_Price,weekday,monthday,is_weekend
0,2015-01-01,1,1,0.0,22.0,163.78,,163.78,3,1,0
1,2015-01-02,1,1,0.0,41.0,163.78,,163.78,4,2,0
2,2015-01-03,1,1,0.0,35.0,163.78,,163.78,5,3,1
3,2015-01-04,1,1,0.0,72.0,163.78,,163.78,6,4,1
4,2015-01-05,1,1,0.0,25.0,163.78,,163.78,0,5,0


In [18]:
def percentile(n):
    '''Build an aggregation callable that returns the n-th percentile, ignoring NaNs.

    The returned function is named 'perc<n>' so that it can be told apart
    from other aggregations by its __name__.
    '''
    def nth_percentile(values):
        return np.nanpercentile(values, n)

    nth_percentile.__name__ = 'perc%s' % n
    return nth_percentile


def fill_missing_dates(x, min_date, max_date):
    '''Sum one group's rows per calendar day and pad the result to a full daily range.

    x - DataFrame with a datetime-like `Date` column
    min_date, max_date - bounds of the daily range (passed to pd.period_range)

    Returns a DataFrame indexed by a daily PeriodIndex named 'Date' that
    covers min_date..max_date; days without any row in `x` are NaN.
    '''
    groupby_day = x.groupby(pd.PeriodIndex(x.Date, freq='D'))
    # numeric_only=True: only numeric columns can be summed. Stating it
    # explicitly keeps the `Date` column (still present in x) out of the sum
    # regardless of the pandas default.
    # min_count=1: a day whose values are all missing stays NaN instead of 0.
    results = groupby_day.sum(numeric_only=True, min_count=1)

    # Insert the days that have no observations at all.
    idx = pd.period_range(min_date, max_date)
    results = results.reindex(idx, fill_value=np.nan)

    results.index.rename('Date', inplace=True)

    return results

def calc_rolling(data, group_col, target_cols, date_col, preagg_method, method, w, dates):
    '''Rolling statistic of target_cols per series, padded to a full daily range.

    data - source DataFrame
    group_col - grouping columns; the last entry is the date column
    target_cols - columns the statistic is computed for
    date_col - name of the date column
    preagg_method - aggregation applied per group_col combination before rolling
    method - aggregation applied inside each rolling window
    w - rolling window (passed to DataFrame.rolling)
    dates - (min_date, max_date) bounds used when filling missing dates
    '''
    first_date, last_date = dates
    series_keys = group_col[:-1]

    # Collapse to one row per (series keys, date).
    daily = data.groupby(group_col).agg(preagg_method)[target_cols].reset_index()

    def roll_one_series(frame):
        return frame.set_index(date_col).rolling(w, min_periods=1).agg(method)

    # Rolling statistic within every series.
    rolled = daily.groupby(series_keys).apply(roll_one_series)
    rolled = rolled.drop(series_keys, axis=1).reset_index()

    # Pad each series so that every day between first_date and last_date is present.
    padded = rolled.groupby(series_keys).apply(
        fill_missing_dates, min_date=first_date, max_date=last_date
    )

    # DataFrame with the rolled target columns.
    return padded.drop(series_keys, axis=1)



def calc_ewm(data, group_col, target_cols, date_col, preagg_method, alpha, dates):
    '''Exponentially weighted mean of target_cols per series, padded to a full daily range.

    data - source DataFrame
    group_col - grouping columns; the last entry is the date column
    target_cols - columns the statistic is computed for
    date_col - name of the date column
    preagg_method - aggregation applied per group_col combination before smoothing
    alpha - smoothing factor (passed to DataFrame.ewm)
    dates - (min_date, max_date) bounds used when filling missing dates
    '''
    first_date, last_date = dates
    series_keys = group_col[:-1]

    # Collapse to one row per (series keys, date).
    daily = data.groupby(group_col).agg(preagg_method)[target_cols].reset_index()

    def smooth_one_series(frame):
        return frame.set_index(date_col).ewm(alpha=alpha).mean()

    # Exponentially weighted mean within every series.
    smoothed = daily.groupby(series_keys).apply(smooth_one_series)
    smoothed = smoothed.drop(series_keys, axis=1).reset_index()

    # Pad each series so that every day between first_date and last_date is present.
    padded = smoothed.groupby(series_keys).apply(
        fill_missing_dates, min_date=first_date, max_date=last_date
    )

    # DataFrame with the smoothed target columns.
    return padded.drop(series_keys, axis=1)


def shift(lf_df_filled, group_col, date_col, lag):
    '''Shift every series by `lag` rows and return a flat DataFrame.

    lf_df_filled - DataFrame whose index levels include group_col[:-1] and date_col
    group_col - grouping columns; the last entry is the date column
    date_col - name of the date index level
    lag - number of rows to shift by within each series

    Returns a DataFrame with the index levels as columns, date_col converted
    to datetime64, and the shifted value columns.
    '''
    series_keys = group_col[:-1]

    # GroupBy.shift keeps the original index, so the result does not depend on
    # how groupby.apply treats group keys for like-indexed output.
    lf_df = lf_df_filled.groupby(level=series_keys).shift(lag).reset_index()

    # Go through the string form to turn the date values into datetime64.
    lf_df[date_col] = pd.to_datetime(lf_df[date_col].astype(str))

    return lf_df


def lagged_features(data,
                    target_cols = ['Demand'],
                    id_cols = ['SKU_id'],
                    date_col = 'Date',
                    lags = [7, 14, 21, 28],
                    windows = ['7D', '14D', '28D', '56D'],
                    preagg_methods = ['mean'], # ['mean', 'count']
                    agg_methods = ['mean', 'median', percentile(10),  percentile(90)],
                    dynamic_filters = ['weekday'],
                    static_filters = None,
                    alphas = [0.1, 0.9]
                    ):
    '''Calculate lagged rolling and exponentially weighted features.

    data - dataframe with default index
    target_cols - column names to calculate lags
    id_cols - key columns
    date_col - name of the date column
    lags - lag values(days)
    windows - list of windows(string type), calculation is performed within time range length of window
    preagg_methods - applied methods before rolling
    agg_methods - method of aggregation, e.g. 'mean', 'median', percentile, etc.
    dynamic_filters - column names of filter
    static_filters - accepted but not used by this function
    alphas - alpha values for ewm method

    Returns a copy of `data` sorted by date_col with one new column per
    (target, lag, filter, preagg, alpha) and per
    (target, lag, filter, preagg, window, agg method) combination.
    The defaults are never mutated here.
    '''

    data = data.sort_values(date_col)
    out_df = data.copy()
    dates = [data[date_col].min(), data[date_col].max()]

    # One tick per merged feature block: len(alphas) ewm blocks plus
    # len(windows) * len(agg_methods) rolling blocks for every
    # (filter, lag, preagg) combination.
    blocks_per_combo = len(alphas) + len(windows) * len(agg_methods)
    total = len(dynamic_filters) * len(lags) * len(preagg_methods) * blocks_per_combo
    progress = IntProgress(min=0, max=total)
    display(progress)

    key_label = '_'.join(id_cols)

    for filter_col in dynamic_filters:
        group_col = [filter_col] + list(id_cols) + [date_col]

        # The unshifted statistics do not depend on the lag, so each one is
        # computed once per filter and reused for every lag. Positions are
        # used as cache keys because the methods themselves may be unhashable.
        ewm_cache = {}
        rolling_cache = {}

        for lag in lags:
            for preagg_pos, preagg in enumerate(preagg_methods):

                ## add ewm features
                for alpha_pos, alpha in enumerate(alphas):
                    cache_key = (preagg_pos, alpha_pos)
                    if cache_key not in ewm_cache:
                        ewm_cache[cache_key] = calc_ewm(data, group_col, target_cols, date_col, preagg, alpha, dates)

                    lf_df = shift(ewm_cache[cache_key], group_col, date_col, lag)
                    new_names = {x: "{0}_lag{1}d_alpha{2}_key{3}_preag{4}_{5}_dynamic_ewm".
                                     format(x, lag, alpha, key_label, preagg, filter_col) for x in target_cols}

                    out_df = pd.merge(out_df, lf_df.rename(columns=new_names), how='left', on=group_col)
                    progress.value += 1

                ## add rolling features
                for w_pos, w in enumerate(windows):
                    for method_pos, method in enumerate(agg_methods):
                        cache_key = (preagg_pos, w_pos, method_pos)
                        if cache_key not in rolling_cache:
                            rolling_cache[cache_key] = calc_rolling(data, group_col, target_cols, date_col, preagg, method, w, dates)

                        ## lf_df - DataFrame with following columns: filter_col, id_cols, date_col, shifted rolling stats
                        lf_df = shift(rolling_cache[cache_key], group_col, date_col, lag)

                        method_name = method if isinstance(method, str) else method.__name__

                        new_names = {x: "{0}_lag{1}d_w{2}_key{3}_preag{4}_ag{5}_{6}_dynamic_rolling".
                                     format(x, lag, w, key_label, preagg, method_name, filter_col) for x in target_cols}

                        out_df = pd.merge(out_df, lf_df.rename(columns=new_names), how='left', on=group_col)
                        progress.value += 1

    return out_df



In [19]:
# Build lagged demand features per (SKU, store) series, computed separately
# within each value of the Promo flag and of the weekday.
data_lagged_features = lagged_features(
    data,
    target_cols=['Demand'],
    id_cols=['SKU_id', 'Store_id'],
    date_col='Date',
    lags=[21, 28, 35, 42],
    windows=['7D', '14D', '21D', '28D'],
    preagg_methods=['mean'],
    agg_methods=['mean', 'median', percentile(10), percentile(90)],
    dynamic_filters=['Promo', 'weekday'],
    static_filters=[],
    alphas=[0.1, 0.9],
)

data_lagged_features


IntProgress(value=0, max=16)

Unnamed: 0,Date,Store_id,SKU_id,Promo,Demand,Regular_Price,Promo_Price,Actual_Price,weekday,monthday,...,Demand_lag35d_w7D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag35d_w14D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag35d_w21D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag35d_w28D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag42d_alpha0.1_keySKU_id_Store_id_preagmean_Promo_dynamic_ewm,Demand_lag42d_alpha0.9_keySKU_id_Store_id_preagmean_Promo_dynamic_ewm,Demand_lag42d_w7D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag42d_w14D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag42d_w21D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling,Demand_lag42d_w28D_keySKU_id_Store_id_preagmean_agmean_Promo_dynamic_rolling
0,2015-01-01,1,1,0.0,22.0,163.78,,163.78,3,1,...,,,,,,,,,,
1,2015-01-01,19,1,0.0,25.0,163.78,,163.78,3,1,...,,,,,,,,,,
2,2015-01-01,39,2,0.0,3.0,135.78,,135.78,3,1,...,,,,,,,,,,
3,2015-01-01,40,2,0.0,0.0,135.78,,135.78,3,1,...,,,,,,,,,,
4,2015-01-01,18,1,0.0,13.0,163.78,,163.78,3,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92881,2016-06-19,47,1,1.0,,134.96,125.04,125.04,6,19,...,,,,,,,,,,
92882,2016-06-19,47,2,0.0,,139.86,,139.86,6,19,...,2.500000,2.142857,1.384615,1.500000,,,,,,
92883,2016-06-19,48,2,0.0,,137.14,,137.14,6,19,...,4.166667,4.857143,4.230769,4.357143,,,,,,
92884,2016-06-19,45,2,0.0,,139.86,,139.86,6,19,...,23.500000,20.142857,20.272727,19.083333,,,,,,
