<a href="https://colab.research.google.com/github/aromanenko/ATSF/blob/wip/hw3_solution_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import libs

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
# %matplotlib inline
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
pd.options.plotting.backend = "plotly"

# upload and investigate data

In [25]:
# upload train.csv file,
# it contains train and test samplesa
all_data = pd.read_csv('train.csv', delimiter=',')

# connvert date-column to data format
all_data['period_start_dt'] = pd.to_datetime(all_data['period_start_dt'], format= "%Y-%m-%d")
all_data.head()

Unnamed: 0.1,Unnamed: 0,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,,,,,,
1,1,40370,309,2016-12-19,64.0,,,,,,
2,2,40372,309,2016-12-19,32.0,,,,,,
3,3,40373,309,2016-12-19,10.0,,,,,,
4,4,46272,309,2016-12-19,15.0,,,,,,


In [26]:
# rename "Unnamed: 0" to id (it's needed to manage train/and)
all_data.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
all_data

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,,,,,,
1,1,40370,309,2016-12-19,64.0,,,,,,
2,2,40372,309,2016-12-19,32.0,,,,,,
3,3,40373,309,2016-12-19,10.0,,,,,,
4,4,46272,309,2016-12-19,15.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,0.0,1000.00,1000.0,0.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,0.0,2000.00,2000.0,0.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,0.0,3000.00,3000.0,0.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,0.0,284.29,199.0,0.0,1.0


In [27]:
# investigate data holiscitly
all_data.describe()

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PROMO2_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,NUM_CONSULTANT,AUTORIZATION_FLAG
count,35344.0,35344.0,35344.0,35344,34144.0,35159.0,35159.0,34217.0,34212.0,35159.0,35159.0
mean,17766.554012,49253.732232,844.240154,2018-07-24 16:17:19.746491904,12.245636,0.206434,0.0,1167.679357,1155.778351,0.0,0.907677
min,0.0,40369.0,309.0,2016-12-19 00:00:00,0.0,0.0,0.0,49.0,8.647059,0.0,0.0
25%,8881.75,40370.0,535.0,2017-11-06 00:00:00,2.0,0.0,0.0,284.29,199.0,0.0,1.0
50%,17770.5,40372.0,862.0,2018-07-30 00:00:00,6.0,0.0,0.0,1000.0,1000.0,0.0,1.0
75%,26647.25,46272.0,1173.0,2019-04-22 00:00:00,12.0,0.0,0.0,2000.0,2000.0,0.0,1.0
max,35541.0,96212.0,1380.0,2019-12-30 00:00:00,1160.0,2.0,0.0,3000.0,3000.0,0.0,1.0
std,10258.040738,19145.064867,333.22916,,32.604642,0.433393,0.0,1046.828551,1057.91283,0.0,0.289486


In [5]:
# draw some time series
def plot_some_ts(ts_df, groupby_columns, time_column, target_column, ts_num = 10, aggregation_method = 'sum'):
  '''pivot original ts by group_columns
     ts_df - original dataframe with ts data,
     group_columns - tuple with names of columns used to split data to time series, use None if splitting is not needed
     time_column - name of column with date, datetime64
     target_column - column with ts data (data should be in numerical format), string
     ts_num - number of ts to be drawn, int
     aggregation_method - aggregation method of data in target column, string
  '''

  if groupby_columns is None:
    ts_df[target_column + time_column + 'const'] = 1
    groupby_columns = [target_column + time_column + 'const']

  pivot_ts = ts_df.groupby(groupby_columns + [time_column]).agg(aggregation_method)

  # concat multiple index to single column
  index_column_name = ', '.join([groupby_columns[i]+'={0['+str(i)+']}' for i in range(len(groupby_columns))])
  pivot_ts.index = [pivot_ts.index.map(index_column_name.format) , pivot_ts.index.get_level_values(len(groupby_columns))]

  # unstack by-column (column that contains ts name)
  pivot_ts = pivot_ts.unstack([0])[target_column]

  # plot first ts_num ts
  fig = pivot_ts[pivot_ts.columns[:ts_num]].plot().update_layout(height=350, width=1300,
                                                  title="first {0} ts for {1} variable".format(ts_num, target_column ),
    xaxis_title=time_column,
    yaxis_title=target_column+ ' value',
    legend_title='ts id columns: '+', '.join(groupby_columns)).show()
  return fig

# data for 3 products (all stores-product level)
plot_some_ts(all_data, ['product_rk'], 'period_start_dt', 'demand', ts_num = 3)

In [6]:
# all stores - all products level
# pay attention to forecasting periods: since 2Dec2019
plot_some_ts(all_data, None, 'period_start_dt', 'demand', ts_num = 1)

# what data dependencies can you observe?

In [28]:
# investigate demand driver columns (explanatory variables )
print(all_data['PROMO1_FLAG'].unique())
print(all_data['PROMO2_FLAG'].unique()) #не используем данную переменную, т.к. значения только 0 и NaN
print(all_data['NUM_CONSULTANT'].unique()) #не используем данную переменную, т.к. значения только 0 и NaN
print(all_data['AUTORIZATION_FLAG'].unique())

[nan  1.  0.  2.]
[nan  0.]
[nan  0.]
[nan  1.  0.]


In [29]:
# remove those, which have only one unique value (not empty) (they do not provide any benefit when training the model)
del all_data['PROMO2_FLAG']
del all_data['NUM_CONSULTANT']
all_data

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,,,,
1,1,40370,309,2016-12-19,64.0,,,,
2,2,40372,309,2016-12-19,32.0,,,,
3,3,40373,309,2016-12-19,10.0,,,,
4,4,46272,309,2016-12-19,15.0,,,,
...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,1000.00,1000.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,2000.00,2000.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,3000.00,3000.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,284.29,199.0,1.0


# data preprocessing
 - do somethin with missing values
 - add some demand drivers to dataset

## fill missing values based on expert insights

In [30]:
# fill na in PROMO1_FLAG with mode-value (based on common-sense)
all_data['PROMO1_FLAG'] = all_data['PROMO1_FLAG'].fillna(all_data['PROMO1_FLAG'].mode()[0]) # most frequent value
all_data

Unnamed: 0,id,product_rk,store_location_rk,period_start_dt,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
0,0,40369,309,2016-12-19,29.0,0.0,,,
1,1,40370,309,2016-12-19,64.0,0.0,,,
2,2,40372,309,2016-12-19,32.0,0.0,,,
3,3,40373,309,2016-12-19,10.0,0.0,,,
4,4,46272,309,2016-12-19,15.0,0.0,,,
...,...,...,...,...,...,...,...,...,...
35339,35537,40370,1380,2019-12-30,,0.0,1000.00,1000.0,1.0
35340,35538,40372,1380,2019-12-30,,0.0,2000.00,2000.0,1.0
35341,35539,40373,1380,2019-12-30,,0.0,3000.00,3000.0,1.0
35342,35540,46272,1380,2019-12-30,,1.0,284.29,199.0,1.0


## fill missing values with prev/back info

In [31]:
# AUTORIZATION_FLAG - is product available at store at the moment
plot_some_ts(all_data, ['product_rk', 'store_location_rk'], 'period_start_dt', 'AUTORIZATION_FLAG', ts_num = 3)

In [49]:
# filling withh prev then next value in pandas
all_data.set_index(['product_rk', 'store_location_rk', 'period_start_dt'])\
  .unstack([0,1])\
   ['PRICE_REGULAR'].\
   ffill().bfill().\
   stack([1,0], future_stack=True)\
   .rename('REGULAR_PRICE_FIXED')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,REGULAR_PRICE_FIXED
period_start_dt,store_location_rk,product_rk,Unnamed: 3_level_1
2016-12-19,309,40369,
2016-12-19,309,40370,
2016-12-19,309,40372,
2016-12-19,309,40373,
2016-12-19,309,46272,
...,...,...,...
2019-12-30,1380,40370,1000.00
2019-12-30,1380,40372,2000.00
2019-12-30,1380,40373,3000.00
2019-12-30,1380,96212,141.43


In [50]:
# fill na with prev (and if no prev then next) values
all_data = all_data.set_index(['product_rk', 'store_location_rk', 'period_start_dt']).\
  merge( all_data.set_index(['product_rk', 'store_location_rk', 'period_start_dt'])\
            .unstack([0,1])\
            ['PRICE_REGULAR'].\
              ffill().bfill().\
            stack([1,0],  future_stack=True).\
            rename('PRICE_REGULAR_FIXED'),
         how = 'left', right_index = True, left_index = True)\
  .reset_index()

# dell original column and replace with new one
del all_data['PRICE_REGULAR']
all_data.rename(columns = {'PRICE_REGULAR_FIXED':'PRICE_REGULAR'}, inplace=True)
all_data.head()

Unnamed: 0,product_rk,store_location_rk,period_start_dt,id,demand,PROMO1_FLAG,PRICE_AFTER_DISC,AUTORIZATION_FLAG,PRICE_REGULAR
0,40369,309,2016-12-19,0,29.0,0.0,,,
1,40370,309,2016-12-19,1,64.0,0.0,,,
2,40372,309,2016-12-19,2,32.0,0.0,,,
3,40373,309,2016-12-19,3,10.0,0.0,,,
4,46272,309,2016-12-19,4,15.0,0.0,,,


In [51]:
def ts_fillna_ffill_bfill(ts_df,column_name, ts_id):
#  all_data.set_index(['product_rk', 'store_location_rk', 'period_start_dt'])\ # define id columns
#   .unstack([0,1])\                          # df -> pivot transformation: date column - is row-index, product x store - is column index
#   ['PRICE_REGULAR'].\                      # define column with data to be fixed
#   ffill().bfill().\                        # fill missing value: apply forwand then back filling method consequently
#   stack([1,0]).\                           # pivot -> ts transformation
#   rename('REGULAR_PRICE_FIXED')            # rename column


  # fill na with prev (and if no prev then next) values
  new_ts_df = ts_df.set_index(ts_id).\
    merge(ts_df.set_index(ts_id)\
              .unstack([0,1])\
              [column_name].\
              ffill().bfill().\
              stack([1,0],  future_stack=True).\
              rename(column_name),
          how = 'left', right_index = True, left_index = True)\
    .reset_index()

  # dell original column and replace with new one
  del new_ts_df[column_name+'_x']
  return new_ts_df.rename(columns = {column_name+'_y':column_name})

In [52]:
all_data.head()

Unnamed: 0,product_rk,store_location_rk,period_start_dt,id,demand,PROMO1_FLAG,PRICE_AFTER_DISC,AUTORIZATION_FLAG,PRICE_REGULAR
0,40369,309,2016-12-19,0,29.0,0.0,,,
1,40370,309,2016-12-19,1,64.0,0.0,,,
2,40372,309,2016-12-19,2,32.0,0.0,,,
3,40373,309,2016-12-19,3,10.0,0.0,,,
4,46272,309,2016-12-19,4,15.0,0.0,,,


In [53]:
# the same fort PRICE_AFTER_DISCOUNT
all_data = ts_fillna_ffill_bfill(ts_df = all_data,column_name = 'PRICE_AFTER_DISC' , ts_id= ['product_rk', 'store_location_rk', 'period_start_dt'])


# the same fort AUTORIZATION_FLAG
all_data = ts_fillna_ffill_bfill(ts_df = all_data,column_name = 'AUTORIZATION_FLAG' , ts_id= ['product_rk', 'store_location_rk', 'period_start_dt'])

In [54]:
# look at data again
all_data.head()

Unnamed: 0,product_rk,store_location_rk,period_start_dt,id,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
0,40369,309,2016-12-19,0,29.0,0.0,,,
1,40370,309,2016-12-19,1,64.0,0.0,,,
2,40372,309,2016-12-19,2,32.0,0.0,,,
3,40373,309,2016-12-19,3,10.0,0.0,,,
4,46272,309,2016-12-19,4,15.0,0.0,,,


## fill missing values as average/mode/median from other stores

In [55]:
# find mean values for each pair product x date
values = all_data.set_index(['product_rk', 'period_start_dt', 'store_location_rk'])\
            .unstack([0,1])\
            ['PRICE_REGULAR'].\
              mean()

# replace missing values with mean in all stores
all_data.set_index(['product_rk', 'store_location_rk', 'period_start_dt'])\
            .unstack([0,2])\
            ['PRICE_REGULAR'].\
            fillna(value = values).\
            stack([1,0], future_stack=True).\
            rename('REGULAR_PRICE_FIXED')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,REGULAR_PRICE_FIXED
store_location_rk,period_start_dt,product_rk,Unnamed: 3_level_1
309,2016-12-19,40369,500.00
309,2016-12-19,40370,1000.00
309,2016-12-19,40372,2000.00
309,2016-12-19,40373,3000.00
309,2016-12-19,46272,157.00
...,...,...,...
1380,2019-12-30,40372,2000.00
1380,2019-12-30,40373,3000.00
1380,2019-12-30,46272,284.29
1380,2019-12-30,96212,141.43


In [56]:
def ts_fillna_aggmethod(ts_df,column_name, ts_id):
#  all_data.set_index(['product_rk', 'store_location_rk', 'period_start_dt'])\ # define id columns
#   .unstack([0,1])\                          # df -> pivot transformation: date column - is row-index, product x store - is column index
#   ['PRICE_REGULAR'].\                      # define column with data to be fixed
#   ffill().bfill().\                        # fill missing value: apply forwand then back filling method consequently
#   stack([1,0]).\                           # pivot -> ts transformation
#   rename('REGULAR_PRICE_FIXED')            # rename column

  values = all_data.set_index(ts_id)\
            .unstack([0,1])\
            [column_name].\
              mean()

  # fill na with prev (and if no prev then next) values
  new_ts_df = ts_df.set_index(ts_id).\
    merge(ts_df.set_index(ts_id)\
              .unstack([0,1])\
              [column_name].\
              fillna(value = values).\
              stack([1,0], future_stack=True).\
              rename(column_name),
          how = 'left', right_index = True, left_index = True)\
    .reset_index()

  # dell original column and replace with new one
  del new_ts_df[column_name+'_x']
  return new_ts_df.rename(columns = {column_name+'_y':column_name})


# for PRICE_REGULAR
all_data = ts_fillna_aggmethod(ts_df = all_data,column_name = 'PRICE_REGULAR' , ts_id= ['product_rk', 'period_start_dt', 'store_location_rk'])


In [57]:
# the same fort PRICE_AFTER_DISCOUNT
all_data = ts_fillna_aggmethod(ts_df = all_data,column_name = 'PRICE_AFTER_DISC' , ts_id= ['product_rk', 'period_start_dt', 'store_location_rk'])


# the same fort AUTORIZATION_FLAG
all_data = ts_fillna_aggmethod(ts_df = all_data,column_name = 'AUTORIZATION_FLAG' , ts_id= ['product_rk', 'period_start_dt', 'store_location_rk'])

In [58]:
# check data again
all_data.isna().sum()

Unnamed: 0,0
product_rk,0
period_start_dt,0
store_location_rk,0
id,0
demand,1200
PROMO1_FLAG,0
PRICE_REGULAR,0
PRICE_AFTER_DISC,0
AUTORIZATION_FLAG,0


In [59]:
# let's delete store 309 related data
all_data = all_data[all_data['store_location_rk'] != 309]
all_data

Unnamed: 0,product_rk,period_start_dt,store_location_rk,id,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG
15,40369,2016-12-19,317,15,50.0,0.0,500.00,500.0,1.0
16,40370,2016-12-19,317,16,44.0,0.0,1000.00,1000.0,1.0
17,40372,2016-12-19,317,17,13.0,0.0,2000.00,2000.0,1.0
18,40373,2016-12-19,317,18,6.0,0.0,3000.00,3000.0,1.0
19,46272,2016-12-19,317,19,34.0,0.0,157.00,157.0,1.0
...,...,...,...,...,...,...,...,...,...
35339,40370,2019-12-30,1380,35537,,0.0,1000.00,1000.0,1.0
35340,40372,2019-12-30,1380,35538,,0.0,2000.00,2000.0,1.0
35341,40373,2019-12-30,1380,35539,,0.0,3000.00,3000.0,1.0
35342,46272,2019-12-30,1380,35540,,1.0,284.29,199.0,1.0


In [60]:
# check data again
# that's it
all_data.isna().sum()

Unnamed: 0,0
product_rk,0
period_start_dt,0
store_location_rk,0
id,0
demand,1200
PROMO1_FLAG,0
PRICE_REGULAR,0
PRICE_AFTER_DISC,0
AUTORIZATION_FLAG,0


## add calendar-feature

In [61]:
# all_data['period_start_dt'] = pd.to_datetime(all_data['period_start_dt'], dayfirst =False)
all_data["ind_of_year"] = [dt.year for dt in all_data.period_start_dt]
all_data["ind_of_month"] = [dt.month for dt in all_data.period_start_dt]
all_data["ind_of_day"] = [dt.day for dt in all_data.period_start_dt]
all_data.head()

Unnamed: 0,product_rk,period_start_dt,store_location_rk,id,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
15,40369,2016-12-19,317,15,50.0,0.0,500.0,500.0,1.0,2016,12,19
16,40370,2016-12-19,317,16,44.0,0.0,1000.0,1000.0,1.0,2016,12,19
17,40372,2016-12-19,317,17,13.0,0.0,2000.0,2000.0,1.0,2016,12,19
18,40373,2016-12-19,317,18,6.0,0.0,3000.0,3000.0,1.0,2016,12,19
19,46272,2016-12-19,317,19,34.0,0.0,157.0,157.0,1.0,2016,12,19


# train ML model

In [62]:
from ipywidgets import IntProgress

from itertools import product
def percentile(n):
    '''Calculate n - percentile of data'''
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'pctl%s' % n
    return percentile_

# add missing dates to GroupBy.Core object
def fill_missing_dates(x, date_col, freq = None, default_value = np.nan):

    if freq is None:
        try:
           freq = pd.infer_freq(x.set_index(date_col).index[:min(100, x.shape[0])])
        except:
           freq = 'D'

        if freq is None:
          freq = 'D'
          Warning('TS freq is not defined! Daily granularity is provided!')
    # print(freq)
    # new indexes without time breaks
    idx = pd.date_range(x[date_col].min(), x[date_col].max(), freq=freq)
    # print(idx)
    results = x.set_index(date_col).reindex(idx,fill_value = default_value)
    results.index.rename(date_col, inplace = True)

    # groupby_day = x.groupby(pd.PeriodIndex(x[date_col], freq='D'))
    # results = groupby_day.sum(min_count=1)

    # idx = pd.period_range(min_date, max_date)
    # results = results.reindex(idx, fill_value=default_value)

    # results.index.rename(date_col, inplace=True)

    return results.reset_index()


def calc_preag_fill(data, group_col, date_col, target_cols, preagg_method):
    ## calc preaggregation
    data_preag = data.groupby(group_col).agg(
        preagg_method)[target_cols].reset_index()

    ## fill missing dates
    data_preag_filled = data_preag.groupby(group_col[:-1]).apply(
         fill_missing_dates, date_col=date_col).drop(group_col[:-1],
                                                     axis=1).reset_index()

    ## return DataFrame with calculated preaggregation and filled missing dates
    return data_preag # ,  data_preag_filled


def calc_rolling(data_preag_filled, group_col, date_col, method, w):

    ## calc rolling stats
    lf_df_filled = data_preag_filled.groupby(group_col[:-1]).\
        apply(lambda x: x.set_index(date_col).rolling(window=w, min_periods=1).agg(method)).drop(group_col[:-1], axis=1).reset_index(group_col)

    ## return DataFrame with rolled columns from target_vars
    return lf_df_filled

# ewma calculation method
def calc_ewm(data_preag_filled, group_col, date_col, span):
    ## calc ewm stats
    lf_df_filled = data_preag_filled.groupby(group_col[:-1]).\
        apply(lambda x: x.set_index(date_col).ewm(span=span).mean()).drop(group_col[:-1], axis=1).reset_index(group_col)

    ## return DataFrame with rolled columns from target_vars
    return lf_df_filled

# shift ts data
def shift(lf_df_filled, group_col, date_col, lag, kwargs = None):

    lf_df = (lf_df_filled.     # prepare calculations
        set_index(date_col).    # date column as time-index
        groupby(group_col[:-1]).       # ids_col to separate different time series
        apply(lambda x: x.shift(lag, kwargs)).    # aplly shift at lag steps
        drop(group_col[:-1], axis=1).                   # drop ids_col from time df to reset index
        reset_index()              # reset index to return df to no-index state
    )
    # lf_df[date_col] = pd.to_datetime(lf_df[date_col].astype(str))
    # print(lf_df.index)

    ## return DataFrame with following columns: filter_col, id_cols, date_col and shifted stats
    return lf_df


def generate_lagged_features(
        data: pd.DataFrame,
        target_cols: list = ['Demand'],
        id_cols: list = ['SKU_id', 'Store_id'],
        date_col: str = 'Date',
        lags: list = [7, 14, 21, 28],
        windows: list = ['7D', '14D', '28D', '56D'],
        preagg_methods: list = ['mean'],
        agg_methods: list = ['mean', 'median', percentile(10), pd.Series.skew],
        dynamic_filters: list = ['weekday', 'Promo'],
        ewm_params: dict = {'weekday': [14, 28], 'Promo': [14, 42]}) -> pd.DataFrame:

    '''
    data - dataframe with default index
    target_cols - column names for lags calculation
    id_cols - key columns to identify unique values
    date_col - column with datetime format values
    lags - lag values(days)
    windows - windows(days/weeks/months/etc.),
        calculation is performed within time range length of window
    preagg_methods - applied methods before rolling to make
        every value unique for given id_cols
    agg_methods - method of aggregation('mean', 'median', percentile, etc.)
    dynamic_filters - column names to use as filter
    ewm_params - span values(days) for each dynamic_filter
    '''

    data = data.sort_values(date_col)
    out_df = deepcopy(data)
    dates = [min(data[date_col]), max(data[date_col])]

    total = len(target_cols) * len(lags) * len(windows) * len(preagg_methods) * len(agg_methods) * len(dynamic_filters)
    progress = IntProgress(min=0, max=total)
    display(progress)

    for filter_col in dynamic_filters:
        group_col = [filter_col] + id_cols + [date_col]
        for preagg in preagg_methods:
          data_preag_filled = calc_preag_fill(data, group_col, date_col,
                                                  target_cols, preagg)

          ## add ewm features
          for alpha in ewm_params.get(filter_col, []):
              ewm_filled = calc_ewm(data_preag_filled, group_col,
                                    date_col, alpha)
              for lag in lags:
                ewm = shift(ewm_filled, group_col, date_col, lag)

                new_names = {x: "{0}_lag{1}d_alpha{2}_key{3}_preag{4}_{5}_dynamic_ewm".\
                    format(x, lag, alpha, '&'.join(id_cols), preagg, filter_col) for x in target_cols}

                out_df = pd.merge(out_df,
                                  ewm.rename(columns=new_names),
                                  how='left',
                                  on=group_col)

          ## add rolling features
          for w in windows:
              for method in agg_methods:
                  rolling_filled = calc_rolling(data_preag_filled,
                                                group_col, date_col,
                                                method, w)
                  for lag in lags:
                    ## lf_df - DataFrame with following columns: filter_col, id_cols, date_col, shifted rolling stats
                    rolling = shift(rolling_filled, group_col, date_col, lag)

                    method_name = method.__name__ if type(
                        method) != str else method

                    new_names = {x: "{0}_lag{1}d_w{2}_key{3}_preag{4}_ag{5}_{6}_dynamic_rolling".\
                                  format(x, lag, w, '&'.join(id_cols), preagg, method_name, filter_col) for x in target_cols}

                    out_df = pd.merge(out_df,
                                      rolling.rename(columns=new_names),
                                      how='left',
                                      on=group_col)
                    progress.value += 1

    return out_df

In [None]:
# use this step to generate lagges features
target_cols = ...
id_cols = ...
date_col = ...
built_in_funcs = [pd.Series.kurtosis, pd.Series.skew]

# flts = {'Promo': {'oprm':'>0', 'npromo':'==0', 'aprm':'>-1'}, 'weekday' : {'md':'==0', 'tue':'==1', 'wd':'==2', 'th':'==3', 'fr':'==4', 'sa':'==5', 'su':'==6', 'anyday':'>-1'}}


all_data['NoFilter'] = 1
all_data = generate_lagged_features(data_train
                    , target_cols = target_cols
                    , id_cols = id_cols
                    , date_col = date_col
                    , lags = [...] # min(lags)>= forecast horizon!
                    , windows = [...]
                    , preagg_methods = [...] # [''sum', 'mean', 'count']
                    , agg_methods = ['mean', 'median', percentile(10),  percentile(90)]
                    , dynamic_filters = [...]
                    , ewm_params={...}
                    )

In [63]:
#split train and test data
data_train = all_data[all_data['demand'].isnull() == False]
data_train

Unnamed: 0,product_rk,period_start_dt,store_location_rk,id,demand,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
15,40369,2016-12-19,317,15,50.0,0.0,500.000000,500.000000,1.0,2016,12,19
16,40370,2016-12-19,317,16,44.0,0.0,1000.000000,1000.000000,1.0,2016,12,19
17,40372,2016-12-19,317,17,13.0,0.0,2000.000000,2000.000000,1.0,2016,12,19
18,40373,2016-12-19,317,18,6.0,0.0,3000.000000,3000.000000,1.0,2016,12,19
19,46272,2016-12-19,317,19,34.0,0.0,157.000000,157.000000,1.0,2016,12,19
...,...,...,...,...,...,...,...,...,...,...,...,...
35309,40370,2019-11-25,1380,35507,24.0,0.0,1000.000000,1000.000000,1.0,2019,11,25
35310,40372,2019-11-25,1380,35508,11.0,0.0,2000.000000,2000.000000,1.0,2019,11,25
35311,40373,2019-11-25,1380,35509,3.0,0.0,3000.000000,3000.000000,1.0,2019,11,25
35312,46272,2019-11-25,1380,35510,0.0,1.0,284.290000,199.000000,1.0,2019,11,25


In [64]:
data_test = all_data[all_data['demand'].isnull()]
# rename "demand" to "predicted"
data_test.rename(columns={'demand': 'predicted'}, inplace=True)
data_test # 1200 samples



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,product_rk,period_start_dt,store_location_rk,id,predicted,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
908,40369,2019-12-02,317,908,,0.0,500.00,500.0,1.0,2019,12,2
909,40370,2019-12-02,317,909,,0.0,1000.00,1000.0,1.0,2019,12,2
910,40372,2019-12-02,317,910,,0.0,2000.00,2000.0,1.0,2019,12,2
911,40373,2019-12-02,317,911,,0.0,3000.00,3000.0,1.0,2019,12,2
912,46272,2019-12-02,317,912,,1.0,284.29,199.0,1.0,2019,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...
35339,40370,2019-12-30,1380,35537,,0.0,1000.00,1000.0,1.0,2019,12,30
35340,40372,2019-12-30,1380,35538,,0.0,2000.00,2000.0,1.0,2019,12,30
35341,40373,2019-12-30,1380,35539,,0.0,3000.00,3000.0,1.0,2019,12,30
35342,46272,2019-12-30,1380,35540,,1.0,284.29,199.0,1.0,2019,12,30


In [70]:
X = data_train.drop(['id', 'demand', 'period_start_dt'], axis=1)
y = data_train['demand']

In [71]:
X

Unnamed: 0,product_rk,store_location_rk,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
15,40369,317,0.0,500.000000,500.000000,1.0,2016,12,19
16,40370,317,0.0,1000.000000,1000.000000,1.0,2016,12,19
17,40372,317,0.0,2000.000000,2000.000000,1.0,2016,12,19
18,40373,317,0.0,3000.000000,3000.000000,1.0,2016,12,19
19,46272,317,0.0,157.000000,157.000000,1.0,2016,12,19
...,...,...,...,...,...,...,...,...,...
35309,40370,1380,0.0,1000.000000,1000.000000,1.0,2019,11,25
35310,40372,1380,0.0,2000.000000,2000.000000,1.0,2019,11,25
35311,40373,1380,0.0,3000.000000,3000.000000,1.0,2019,11,25
35312,46272,1380,1.0,284.290000,199.000000,1.0,2019,11,25


In [72]:
# answers in train period
y

Unnamed: 0,demand
15,50.0
16,44.0
17,13.0
18,6.0
19,34.0
...,...
35309,24.0
35310,11.0
35311,3.0
35312,0.0


In [73]:
# for training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [74]:
# train model
regressor = GradientBoostingRegressor(
    max_depth=11,
    n_estimators=420,
    learning_rate=0.2,
    random_state=1,
    min_samples_leaf=11,
    min_samples_split=2,
    loss='absolute_error'
)
regressor.fit(X_train, y_train)

In [75]:
y_pred = regressor.predict(X_test)
mean_absolute_error(y_test, y_pred)

4.911675066161127

In [78]:
# score sample from forecasting period
X_test = data_test.drop(['id', 'predicted', 'period_start_dt'], axis=1)
X_test

Unnamed: 0,product_rk,store_location_rk,PROMO1_FLAG,PRICE_REGULAR,PRICE_AFTER_DISC,AUTORIZATION_FLAG,ind_of_year,ind_of_month,ind_of_day
908,40369,317,0.0,500.00,500.0,1.0,2019,12,2
909,40370,317,0.0,1000.00,1000.0,1.0,2019,12,2
910,40372,317,0.0,2000.00,2000.0,1.0,2019,12,2
911,40373,317,0.0,3000.00,3000.0,1.0,2019,12,2
912,46272,317,1.0,284.29,199.0,1.0,2019,12,2
...,...,...,...,...,...,...,...,...,...
35339,40370,1380,0.0,1000.00,1000.0,1.0,2019,12,30
35340,40372,1380,0.0,2000.00,2000.0,1.0,2019,12,30
35341,40373,1380,0.0,3000.00,3000.0,1.0,2019,12,30
35342,46272,1380,1.0,284.29,199.0,1.0,2019,12,30


In [79]:
# look at forecsting values
y_pred_res = regressor.predict(X_test)
y_pred_res

array([ 4.9134842 ,  7.26839186,  5.96910099, ..., 20.79117113,
        7.33659533,  3.77091709])

# forecast postprocessing

In [80]:
# convert to df
y_results = data_test[['id', 'predicted']]
y_results['predicted'] = y_pred_res
y_results



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,id,predicted
908,908,4.913484
909,909,7.268392
910,910,5.969101
911,911,5.929214
912,912,3.415926
...,...,...
35339,35537,86.981169
35340,35538,39.054347
35341,35539,20.791171
35342,35540,7.336595


In [81]:
# change negative forecast to 0
y_results.loc[y_results['predicted'] < 0, ['predicted']]

Unnamed: 0,predicted
6449,-0.013221
6461,-0.257183
6473,-1.126443
9220,-0.417462
22144,-0.029753
22156,-0.438386
24916,-0.081803
31373,-1.43714
31379,-0.684439
34993,-0.055577


In [82]:
y_results.loc[y_results['predicted'] < 0, ['predicted']] = 0
y_results.loc[y_results['predicted'] < 0, ['predicted']]

Unnamed: 0,predicted


In [83]:
#prepare output csv-file:
y_results.to_csv('./submission_example.csv',sep=',', encoding='utf-8', index=False)