In [4]:
import pandas as pd
import numpy as np
import datetime
from autocorrections_v2 import *

# Load data

In [5]:
DPS_PRODUCT = pd.read_csv('data/DPS_PRODUCT.csv')
DPS_LOCATION = pd.read_csv('data/DPS_LOCATION.csv')
DPS_CUSTOMER = pd.read_csv('data/DPS_CUSTOMER.csv')
DPS_DISTR_CHANNEL = pd.read_csv('data/DPS_DISTR_CHANNEL.csv')
DPS_PROMO = pd.read_csv('data/DPS_PROMO.csv')

DPS_PRODUCT.columns = DPS_PRODUCT.columns.str.lower()
DPS_LOCATION.columns = DPS_LOCATION.columns.str.lower()
DPS_CUSTOMER.columns = DPS_CUSTOMER.columns.str.lower()
DPS_DISTR_CHANNEL.columns = DPS_DISTR_CHANNEL.columns.str.lower()
DPS_PROMO.columns  = DPS_PROMO.columns.str.lower()

In [34]:
# cut the dataset

In [7]:
DPS_PRODUCT = DPS_PRODUCT[DPS_PRODUCT['product_id'].isin(DPS_PRODUCT['product_id'].unique()[:30])]
DPS_LOCATION = DPS_LOCATION[DPS_LOCATION['location_id'].isin(DPS_LOCATION['location_id'].unique()[:10])]
DPS_CUSTOMER = DPS_CUSTOMER[DPS_CUSTOMER['customer_id'].isin(DPS_CUSTOMER['customer_id'].unique()[:10])]

In [8]:
IN_DATA = {
    'product' : DPS_PRODUCT,
    'location' : DPS_LOCATION,
    'customer' : DPS_CUSTOMER,
    'distr_channel' : DPS_DISTR_CHANNEL, 
    'promo': DPS_PROMO
}

In [3]:
# Set config parameters

In [9]:
CONFIG_PARAMETERS = {
    'ib_adj_forecast_list' : 'ACC_AGG_HYBRID_FORECAST_',
    'ib_npf_max_hist_depth' : 28,
    'ib_adj2_min_observ_num' : 7,
    'ib_adj2_base_past_period' : 56,
    'ib_adj3_base_past_period' : 56,
    'ib_adj3_seasonl_calc_lvl' : 7, 
    'ib_adj3_use_seas_coef_flg' : 1,
    'ib_adj3_min_observ_num' : 4, # 
    'ib_adj3_correction_method' : 'mean', # alternative is 'bound'
    'ibn_ff_active_status_list' : 'active'
}

In [10]:
CONFIG_FILE = {
    'tgt_type' : 'POS',
    'tgt_qty_table' : 'IN_SALES',
    'value_src' : 'SALES_QTY',
    'act_flag' : 1,
    'dr_scen' : 0, 
    'link_with_stock' : 1,
    'link_with_promo' : 1,
    'link_with_price' : 1,
    'vf_product_lvl' : 1,
    'vf_location_lvl' : 1,
    'vf_customer_lvl' : 1,
    'vf_distr_channel_lvl' : 1,
    'vf_time_lvl' : 'WEEK.2',
    'ml_product_lvl' : 7,
    'ml_location_lvl' : 5,
    'ml_customer_lvl' : 3,
    'ml_distr_channel_lvl' : 3,
    'ml_time_lvl' : 'WEEK.2',
    'out_product_lvl' : 7, # aggregation level 
    'out_location_lvl' : 5, # aggregation level
    'out_customer_lvl' : 5, # aggregation level
    'out_distr_channel_lvl' : 1, # aggregation level
    'out_time_lvl' : 'WEEK.2'
}

In [11]:
INITIAL_GLOBAL_FILE = {'IB_HIST_START_DT': datetime.datetime(2022, 6, 15), 
                      'IB_HIST_END_DT': datetime.datetime(2022, 7, 25),
                      'IB_FCST_HORIZON': datetime.timedelta(days = 30)
                      }

# Generate input data

In [12]:
def generate_ACC_AGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE):
    # Generates input data: forecast at 'out_..._lvl' aggregation level

    
    # set historical + forecast dates
    # for weekly frequency report start of the week (monday)
    freq = (CONFIG_FILE['out_time_lvl'][0] + '-MON') if (CONFIG_FILE['out_time_lvl'] == 'WEEK.2') else CONFIG_FILE['out_time_lvl'][0]
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'] - pd.Timedelta(days=INITIAL_GLOBAL_FILE['IB_HIST_START_DT'].weekday()), INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'], freq = freq)

    # create df from IN_DATA values, disaggregated at out_..._lvl
    ACC_AGG_HYBRID_FORECAST_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_lvl_id{CONFIG_FILE[f'out_{key}_lvl']}"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        ACC_AGG_HYBRID_FORECAST_ = pd.merge(ACC_AGG_HYBRID_FORECAST_, keys_df, 'cross')

    # fill unknown data with random numbers
    ACC_AGG_HYBRID_FORECAST_['segment_name'] = np.random.choice([1, 2, 3, 4], ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['vf_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['demand_type'] = np.random.choice(['promo', 'regular'], ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['ml_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['hybrid_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['ensemble_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['forecast_source'] = np.random.choice(['vf', 'ml'], ACC_AGG_HYBRID_FORECAST_.shape[0])

    return ACC_AGG_HYBRID_FORECAST_

In [13]:
ACC_AGG_HYBRID_FORECAST_ = generate_ACC_AGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE)

In [14]:
ACC_AGG_HYBRID_FORECAST_ 

Unnamed: 0,period_dt,product_lvl_id7,location_lvl_id5,customer_lvl_id5,distr_channel_lvl_id1,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
0,2022-06-13,70001,500015,5000001,2,4,102.458059,regular,186.718074,145.351768,192.492749,ml
1,2022-06-13,70001,500015,5000002,2,2,141.719982,regular,194.995630,111.292033,168.089068,ml
2,2022-06-13,70001,500014,5000001,2,2,184.869727,regular,181.209166,144.575167,133.934479,vf
3,2022-06-13,70001,500014,5000002,2,1,155.461380,promo,127.770660,141.587366,104.501860,ml
4,2022-06-13,70001,500016,5000001,2,2,100.810720,promo,170.022389,187.504355,113.574605,vf
...,...,...,...,...,...,...,...,...,...,...,...,...
3295,2022-08-22,70030,500016,5000002,2,1,199.334407,regular,158.582161,177.410315,109.397269,vf
3296,2022-08-22,70030,500013,5000001,2,1,148.575309,promo,164.748682,149.367517,117.725111,vf
3297,2022-08-22,70030,500013,5000002,2,4,144.789723,promo,187.351350,145.859116,143.854154,ml
3298,2022-08-22,70030,500017,5000001,2,2,161.290984,regular,116.281387,172.290694,131.118731,vf


In [15]:
def generate_DISACC_DISAGG_HYBRID_FORECAST_(IN_DATA, 
                                            CONFIG_FILE,
                                            INITIAL_GLOBAL_FILE):

    # Generates input data: forecasts at lowest hierarchy level
    
    # set historical + forecast dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    DISACC_DISAGG_HYBRID_FORECAST_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        DISACC_DISAGG_HYBRID_FORECAST_ = pd.merge(DISACC_DISAGG_HYBRID_FORECAST_, keys_df, 'cross')

    # fill unknown data with random numbers
    DISACC_DISAGG_HYBRID_FORECAST_['segment_name'] = np.random.choice([1, 2, 3, 4], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['vf_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['demand_type'] = np.random.choice(['promo', 'regular'], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['ml_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['hybrid_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['ensemble_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['forecast_source'] = np.random.choice(['vf', 'ml'], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])

    return DISACC_DISAGG_HYBRID_FORECAST_

In [16]:
DISACC_DISAGG_HYBRID_FORECAST_ = generate_DISACC_DISAGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE)

In [17]:
DISACC_DISAGG_HYBRID_FORECAST_ 

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
0,2022-06-15,80001,600002,6000002,1,3,152.476098,promo,117.423570,186.457119,187.887594,ml
1,2022-06-15,80001,600002,6000003,1,3,112.886350,regular,142.000996,178.503435,190.030242,vf
2,2022-06-15,80001,600002,6000004,1,4,177.273130,regular,198.806421,156.454665,175.845675,vf
3,2022-06-15,80001,600002,6000005,1,2,163.864399,promo,121.285525,178.250802,175.332455,ml
4,2022-06-15,80001,600002,6000006,1,4,172.452359,promo,147.624937,113.559652,146.437813,ml
...,...,...,...,...,...,...,...,...,...,...,...,...
212995,2022-08-24,80030,600011,6000007,1,1,195.923065,regular,184.754840,178.428715,131.425287,vf
212996,2022-08-24,80030,600011,6000008,1,3,109.102565,promo,159.945862,125.527151,117.385653,vf
212997,2022-08-24,80030,600011,6000009,1,3,158.543612,promo,166.951086,187.107210,141.757817,vf
212998,2022-08-24,80030,600011,6000010,1,3,197.969182,promo,109.416262,152.236308,198.038032,vf


In [18]:
def generate_DEMAND_RESTORED_(IN_DATA, 
                              CONFIG_FILE,
                              INITIAL_GLOBAL_FILE):
    
    # Generates input data: demand info
    
    # set historical dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'] , INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    DEMAND_RESTORED_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel', 'promo']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        DEMAND_RESTORED_ = pd.merge(DEMAND_RESTORED_, keys_df, 'cross')

    # fill unknown data with random numbers
    DEMAND_RESTORED_['tgt_qty_r'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['promo_flg'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['tgt_qty'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['stock_qty'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['deficit_flg1'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['deficit_flg2'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    
    return DEMAND_RESTORED_

In [20]:
DEMAND_RESTORED_ = generate_DEMAND_RESTORED_(IN_DATA, 
                              CONFIG_FILE,
                              INITIAL_GLOBAL_FILE)

In [21]:
DEMAND_RESTORED_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,promo_id,tgt_qty_r,promo_flg,tgt_qty,stock_qty,deficit_flg1,deficit_flg2
0,2022-06-15,80001,600002,6000002,1,1,106.420906,0,182.103570,194.134455,1,0
1,2022-06-15,80001,600002,6000003,1,1,132.263829,0,142.156996,131.340955,0,1
2,2022-06-15,80001,600002,6000004,1,1,123.896546,1,166.946359,130.113615,1,1
3,2022-06-15,80001,600002,6000005,1,1,182.890484,1,144.572210,129.329930,0,1
4,2022-06-15,80001,600002,6000006,1,1,192.812091,0,160.806651,106.481714,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
122995,2022-07-25,80030,600011,6000007,1,1,146.056443,1,190.825720,148.817152,0,0
122996,2022-07-25,80030,600011,6000008,1,1,106.510431,1,165.833224,155.866606,1,0
122997,2022-07-25,80030,600011,6000009,1,1,152.594676,0,140.209868,106.769739,0,0
122998,2022-07-25,80030,600011,6000010,1,1,152.660042,0,136.876270,167.590649,1,1


In [22]:
def generate_FORECAST_FLAG_(IN_DATA, 
                   CONFIG_FILE,
                   INITIAL_GLOBAL_FILE):

    # Generates input data: flags whether given unit is active and should be used in forecasting or not
    
    # set historical and forecast dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    FORECAST_FLAG_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        FORECAST_FLAG_ = pd.merge(FORECAST_FLAG_, keys_df, 'cross')

    FORECAST_FLAG_['period_start_dt'] = INITIAL_GLOBAL_FILE['IB_HIST_START_DT']
    FORECAST_FLAG_['period_end_dt'] = INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON']

    # set status ['active', 'blocked', 'out-of-sale'] for each data unit. 
    # Assume that for the forecasting period last historical status is preserved

    timerange_hist = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])
    FORECAST_FLAG_HIST = pd.DataFrame(timerange_hist, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        FORECAST_FLAG_HIST = pd.merge(FORECAST_FLAG_HIST, keys_df, 'cross')

    FORECAST_FLAG_HIST['status'] = np.random.choice(['active', 'blocked', 'out-of-sale'], p = [0.5, 0.25, 0.25], size = FORECAST_FLAG_HIST.shape[0])
    FORECAST_FLAG_ =  pd.merge(FORECAST_FLAG_, FORECAST_FLAG_HIST, how = 'outer')

    last_vals = FORECAST_FLAG_.groupby(by = ['product_id', 'location_id', 'customer_id', 'distr_channel_id']).last()['status'].reset_index()

    missing = FORECAST_FLAG_[FORECAST_FLAG_['status'].isna()].drop(columns = 'status')
    missing = pd.merge(missing, last_vals, how = 'outer')
    
    FORECAST_FLAG_ = pd.concat([FORECAST_FLAG_[~FORECAST_FLAG_['status'].isna()], missing], ignore_index = True)
    FORECAST_FLAG_.sort_values(['period_dt', 'product_id', 'location_id', 'customer_id', 'distr_channel_id'])
    
    return FORECAST_FLAG_

In [23]:
FORECAST_FLAG_ = generate_FORECAST_FLAG_(IN_DATA, CONFIG_FILE, INITIAL_GLOBAL_FILE)

In [24]:
FORECAST_FLAG_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,period_start_dt,period_end_dt,status
0,2022-06-15,80001,600002,6000002,1,2022-06-15,2022-08-24,out-of-sale
1,2022-06-15,80001,600002,6000003,1,2022-06-15,2022-08-24,active
2,2022-06-15,80001,600002,6000004,1,2022-06-15,2022-08-24,blocked
3,2022-06-15,80001,600002,6000005,1,2022-06-15,2022-08-24,active
4,2022-06-15,80001,600002,6000006,1,2022-06-15,2022-08-24,blocked
...,...,...,...,...,...,...,...,...
212995,2022-08-20,80030,600011,6000011,1,2022-06-15,2022-08-24,active
212996,2022-08-21,80030,600011,6000011,1,2022-06-15,2022-08-24,active
212997,2022-08-22,80030,600011,6000011,1,2022-06-15,2022-08-24,active
212998,2022-08-23,80030,600011,6000011,1,2022-06-15,2022-08-24,active


In [25]:
def generate_PRE_ABT_(IN_DATA, 
                      CONFIG_FILE,
                      INITIAL_GLOBAL_FILE):

    # Generate input data: data used in forecasting
    
    # set historical dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    PRE_ABT_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel', 'promo']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        PRE_ABT_ = pd.merge(PRE_ABT_, keys_df, 'cross')

    # fill unknown data with random numbers
    PRE_ABT_['tgt_qty_r'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['tgt_qty'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['stock_qty'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['deficit_flg1'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['deficit_flg2'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['promo_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['num_autorization'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['price_promo'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['promo_type1_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['promo_type2_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['price_act'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['price_reg'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])    

    return PRE_ABT_

In [26]:
PRE_ABT_ = generate_PRE_ABT_(IN_DATA, CONFIG_FILE, INITIAL_GLOBAL_FILE)

In [27]:
PRE_ABT_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,promo_id,tgt_qty_r,tgt_qty,stock_qty,deficit_flg1,deficit_flg2,promo_flg,num_autorization,price_promo,promo_type1_flg,promo_type2_flg,price_act,price_reg
0,2022-06-15,80001,600002,6000002,1,1,146.689822,164.519232,110.272390,1,1,1,0,136.484037,0,1,113.062384,131.940579
1,2022-06-15,80001,600002,6000003,1,1,179.691067,163.830794,144.164348,1,1,0,0,137.296950,1,1,156.384995,130.754516
2,2022-06-15,80001,600002,6000004,1,1,186.342547,111.171153,183.166170,0,0,1,1,158.735205,0,0,177.753689,170.636835
3,2022-06-15,80001,600002,6000005,1,1,112.771324,138.257147,147.327570,0,0,1,1,138.192457,1,1,128.263770,190.859878
4,2022-06-15,80001,600002,6000006,1,1,147.135283,125.400794,143.730901,1,0,0,0,195.513245,0,0,197.323280,163.609684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122995,2022-07-25,80030,600011,6000007,1,1,196.300862,138.941993,198.211615,1,0,0,0,160.032373,0,0,180.206012,110.897332
122996,2022-07-25,80030,600011,6000008,1,1,193.587539,160.955568,190.347485,1,0,0,1,180.432277,1,0,125.363002,156.934700
122997,2022-07-25,80030,600011,6000009,1,1,120.650861,156.221367,154.031737,0,0,1,0,172.996398,0,0,188.567312,101.208859
122998,2022-07-25,80030,600011,6000010,1,1,108.722903,140.753343,122.900306,1,1,1,0,144.480285,0,0,177.115378,151.434705


In [28]:
ARLEY_CRIT = pd.DataFrame({
    'cnt_observations_lbound': [0, 4, 6, 8, 10, 12, 14, 16],
    'cnt_observations_ubound': [3, 5, 7, 9, 11, 13, 15, 999999],
    'k_arley_001': [1.60, 2.15, 2.30, 2.40, 2.44, 2.46, 2.46, 2.46],
    'k_arley_005': [1.50, 1.85, 1.90, 1.92, 1.92, 1.92, 1.92, 1.92]})

In [29]:
ARLEY_CRIT

Unnamed: 0,cnt_observations_lbound,cnt_observations_ubound,k_arley_001,k_arley_005
0,0,3,1.6,1.5
1,4,5,2.15,1.85
2,6,7,2.3,1.9
3,8,9,2.4,1.92
4,10,11,2.44,1.92
5,12,13,2.46,1.92
6,14,15,2.46,1.92
7,16,999999,2.46,1.92


# Type 1
## delete forecast for inactive period and mark forecast with missing valuesfor in-active period of assortment lifecycle 

Algorithm description:\
Input: DISACC_DISAGG_HYBRID_FORECAST_, FORECAST_FLAG_\
Algorithm description: 
- corr1: flag forecasts that should not be forecasted (not active based on FORECAST_FLAG_) (app)
- corr2: flag forecasts with missing values
Output: DISACC_DISAGG_HYBRID_FORECAST_ with flags for corr1, corr2

In [30]:
# introduce noise into DISACC_DISAGG_HYBRID_FORECAST_
mask_1 = DISACC_DISAGG_HYBRID_FORECAST_['period_dt'] > INITIAL_GLOBAL_FILE['IB_HIST_END_DT']
mask_2 = np.random.choice([True, False], p = [0.01, 0.99], size = DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
mask = mask_1 & mask_2
DISACC_DISAGG_HYBRID_FORECAST_.loc[mask, 'hybrid_forecast_value'] = np.nan

In [31]:
DISACC_DISAGG_HYBRID_FORECAST_.iloc[np.where(mask == True)[0][777]-3 : np.where(mask == True)[0][777]+3, :]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
202885,2022-08-21,80019,600010,6000007,1,3,158.141355,regular,175.541646,167.65716,132.203169,vf
202886,2022-08-21,80019,600010,6000008,1,4,107.20949,regular,113.399371,108.263916,120.569103,vf
202887,2022-08-21,80019,600010,6000009,1,2,118.28097,promo,170.189346,127.156324,139.830199,ml
202888,2022-08-21,80019,600010,6000010,1,2,105.494751,regular,131.133023,,120.83217,vf
202889,2022-08-21,80019,600010,6000011,1,3,190.185031,regular,128.995052,104.541915,153.813436,ml
202890,2022-08-21,80019,600011,6000002,1,3,150.755106,regular,175.048129,147.466466,121.828467,ml


In [32]:
T1 = autocorrections_type1(DISACC_DISAGG_HYBRID_FORECAST_, FORECAST_FLAG_)

In [34]:
# check flags
T1.iloc[np.where(mask == True)[0][0]-3 : np.where(mask == True)[0][0]+3, :]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source,period_start_dt,period_end_dt,status,flg_apply_corr1,flg_apply_corr2
123197,2022-07-26,80002,600011,6000009,1,4,163.304012,promo,170.596661,175.667793,155.681,ml,2022-06-15,2022-08-24,out-of-sale,1,0
123198,2022-07-26,80002,600011,6000010,1,4,163.238434,regular,132.057695,147.755202,159.977495,ml,2022-06-15,2022-08-24,blocked,1,0
123199,2022-07-26,80002,600011,6000011,1,2,104.77772,regular,156.00218,170.339582,149.864265,vf,2022-06-15,2022-08-24,out-of-sale,1,0
123200,2022-07-26,80003,600002,6000002,1,4,115.958174,regular,135.259692,,123.916047,vf,2022-06-15,2022-08-24,active,0,1
123201,2022-07-26,80003,600002,6000003,1,4,176.829636,regular,154.716808,138.132341,116.524593,vf,2022-06-15,2022-08-24,active,0,0
123202,2022-07-26,80003,600002,6000004,1,2,105.495633,promo,187.608739,107.92644,147.91077,vf,2022-06-15,2022-08-24,active,0,0


# Type 2
## fill missing forecast values

Algorithm description:\
Input: output from autocorrections_1 (DISACC_DISAGG_HYBRID_FORECAST_ flagged on step 1)\
Algorithm description: 
- for active forecast instances
- differentiate over regular and promo
- for units with enough recent forecast values (based on CONFIG_PARAMETERS['ib_adj2_min_observ_num'])
- substitute missing forecasts with average forecast over last on CONFIG_PARAMETERS['ib_npf_max_hist_depth']
- if not enough recent forecast values, go up the hierarchy
- first collect recent forecasts on same unit but in all distribution channels
- if still not enough values ommit customer and then location
Output: DISACC_DISAGG_HYBRID_FORECAST_ with 2 flags for correction and missing forecasts replaced (if possible)

In [35]:
T2 = autocorrections_type2(T1, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE, 
          INITIAL_GLOBAL_FILE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if part_1.shape[0] > 0: part_1['hybrid_forecast_value_aft2'] = part_1.apply(own_average, axis=1,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if part_2.shape[0] > 0: part_2['hybrid_forecast_value_aft2'] = part_2.apply(group_average, axis=1,


In [36]:
T2.iloc[np.where(mask == True)[0][0]-3 : np.where(mask == True)[0][0]+3, [0, 1, 2, 3, 9, -4, -3, -2, -1]]

Unnamed: 0,period_dt,product_id,location_id,customer_id,hybrid_forecast_value,status,flg_apply_corr1,flg_apply_corr2,hybrid_forecast_value_aft2
123197,2022-07-26,80002,600011,6000009,175.667793,out-of-sale,1,0,
123198,2022-07-26,80002,600011,6000010,147.755202,blocked,1,0,
123199,2022-07-26,80002,600011,6000011,170.339582,out-of-sale,1,0,
123200,2022-07-26,80003,600002,6000002,,active,0,1,148.029108
123201,2022-07-26,80003,600002,6000003,138.132341,active,0,0,138.132341
123202,2022-07-26,80003,600002,6000004,107.92644,active,0,0,107.92644


# Type 3

# Treat outliers

Algorithm description:\
Input: output from autocorrections_2 (DISACC_DISAGG_HYBRID_FORECAST_ with missing forecasts filled on step_2\
Algorithm description: 
- for active forecast instances with regular demand and previously not corrected by step 2,
- for units with enough historical values (based on CONFIG_PARAMETERS['ib_adj3_min_observ_num'])
- check whether forecast is an outlier based on Atley criterion for the historical values for the unit
- if forecast is an outlier then replace it accoarding to CONFIG_PARAMETERS['ib_adj3_correction_method'] (mean or bound)
- if 'mean': replace with mean historical value
- if 'bound' replace with Arley criterion corresponding bound (upper / lower)
Output: DISACC_DISAGG_HYBRID_FORECAST_ with 3 flags for correction and final forecast value after all corrections

In [43]:
# introduce outliers
indexes = T2[(T2['period_dt'] > INITIAL_GLOBAL_FILE['IB_HIST_END_DT']) & (T2['demand_type'] == 'regular') & (T2['flg_apply_corr1'] == 0) & (T2['flg_apply_corr2'] == 0)].index
T2.loc[indexes[321], 'hybrid_forecast_value_aft2'] = 10000
T2.loc[indexes[456], 'hybrid_forecast_value_aft2'] = 10000
T2.loc[indexes[789], 'hybrid_forecast_value_aft2'] = 10

In [44]:
T2.loc[[indexes[321], indexes[456], indexes[789]], :]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source,period_start_dt,period_end_dt,status,flg_apply_corr1,flg_apply_corr2,hybrid_forecast_value_aft2
124234,2022-07-26,80013,600005,6000006,1,1,183.091042,regular,165.368478,178.308993,154.15852,vf,2022-06-15,2022-08-24,active,0,0,10000.0
124804,2022-07-26,80019,600002,6000006,1,4,140.132675,regular,139.540227,152.007391,139.050593,ml,2022-06-15,2022-08-24,active,0,0,10000.0
126173,2022-07-27,80002,600009,6000005,1,3,176.466882,regular,131.656321,198.949731,186.295078,ml,2022-06-15,2022-08-24,active,0,0,10.0


In [47]:
# first try substituting outliers with mean
CONFIG_PARAMETERS['ib_adj3_correction_method'] = 'mean'

In [48]:
T3_1 = autocorrections_type3(T2, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE, 
          INITIAL_GLOBAL_FILE, ARLEY_CRIT)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3['hybrid_forecast_value_aft2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3.apply(own_average_hist, axis=1,


In [49]:
T3_1.loc[[indexes[321], indexes[456], indexes[789]], :]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source,period_start_dt,period_end_dt,status,flg_apply_corr1,flg_apply_corr2,hybrid_forecast_value_aft2,hybrid_forecast_value_aft3,flg_apply_corr3
124234,2022-07-26,80013,600005,6000006,1,1,183.091042,regular,165.368478,178.308993,154.15852,vf,2022-06-15,2022-08-24,active,0,0,10000.0,145.873829,1
124804,2022-07-26,80019,600002,6000006,1,4,140.132675,regular,139.540227,152.007391,139.050593,ml,2022-06-15,2022-08-24,active,0,0,10000.0,131.021011,1
126173,2022-07-27,80002,600009,6000005,1,3,176.466882,regular,131.656321,198.949731,186.295078,ml,2022-06-15,2022-08-24,active,0,0,10.0,152.593005,1


In [50]:
# now try substituting outliers with bounds
CONFIG_PARAMETERS['ib_adj3_correction_method'] = 'bound'

In [51]:
T3_2 = autocorrections_type3(T2, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE, 
          INITIAL_GLOBAL_FILE, ARLEY_CRIT)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3['hybrid_forecast_value_aft2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3.apply(own_average_hist, axis=1,


In [52]:
T3_2.loc[[indexes[321], indexes[456], indexes[789]], :]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source,period_start_dt,period_end_dt,status,flg_apply_corr1,flg_apply_corr2,hybrid_forecast_value_aft2,hybrid_forecast_value_aft3,flg_apply_corr3
124234,2022-07-26,80013,600005,6000006,1,1,183.091042,regular,165.368478,178.308993,154.15852,vf,2022-06-15,2022-08-24,active,0,0,10000.0,209.552105,1
124804,2022-07-26,80019,600002,6000006,1,4,140.132675,regular,139.540227,152.007391,139.050593,ml,2022-06-15,2022-08-24,active,0,0,10000.0,168.955001,1
126173,2022-07-27,80002,600009,6000005,1,3,176.466882,regular,131.656321,198.949731,186.295078,ml,2022-06-15,2022-08-24,active,0,0,10.0,96.845914,1
