In [1]:
import pandas as pd
import numpy as np
import datetime
from autocorrections_v2 import *

# Load data

In [2]:
DPS_PRODUCT = pd.read_csv('data/DPS_PRODUCT.csv')
DPS_LOCATION = pd.read_csv('data/DPS_LOCATION.csv')
DPS_CUSTOMER = pd.read_csv('data/DPS_CUSTOMER.csv')
DPS_DISTR_CHANNEL = pd.read_csv('data/DPS_DISTR_CHANNEL.csv')
DPS_PROMO = pd.read_csv('data/DPS_PROMO.csv')

DPS_PRODUCT.columns = DPS_PRODUCT.columns.str.lower()
DPS_LOCATION.columns = DPS_LOCATION.columns.str.lower()
DPS_CUSTOMER.columns = DPS_CUSTOMER.columns.str.lower()
DPS_DISTR_CHANNEL.columns = DPS_DISTR_CHANNEL.columns.str.lower()
DPS_PROMO.columns  = DPS_PROMO.columns.str.lower()

In [34]:
# cut the dataset

In [3]:
DPS_PRODUCT = DPS_PRODUCT[DPS_PRODUCT['product_id'].isin(DPS_PRODUCT['product_id'].unique()[:15])]
DPS_LOCATION = DPS_LOCATION[DPS_LOCATION['location_id'].isin(DPS_LOCATION['location_id'].unique()[:5])]
DPS_CUSTOMER = DPS_CUSTOMER[DPS_CUSTOMER['customer_id'].isin(DPS_CUSTOMER['customer_id'].unique()[:5])]

In [4]:
IN_DATA = {
    'product' : DPS_PRODUCT,
    'location' : DPS_LOCATION,
    'customer' : DPS_CUSTOMER,
    'distr_channel' : DPS_DISTR_CHANNEL, 
    'promo': DPS_PROMO
}

In [3]:
# Set config parameters

In [5]:
CONFIG_PARAMETERS = {
    'ib_adj_forecast_list' : 'ACC_AGG_HYBRID_FORECAST_',
    'ib_npf_max_hist_depth' : 28,
    'ib_adj2_min_observ_num' : 7,
    'ib_adj2_base_past_period' : 56,
    'ib_adj3_base_past_period' : 56,
    'ib_adj3_seasonl_calc_lvl' : 7, 
    'ib_adj3_use_seas_coef_flg' : 1,
    'ib_adj3_min_observ_num' : 4, # 
    'ib_adj3_correction_method' : 'mean', # alternative is 'bound'
    'ibn_ff_active_status_list' : 'active'
}

In [6]:
CONFIG_FILE = {
    'tgt_type' : 'POS',
    'tgt_qty_table' : 'IN_SALES',
    'value_src' : 'SALES_QTY',
    'act_flag' : 1,
    'dr_scen' : 0, 
    'link_with_stock' : 1,
    'link_with_promo' : 1,
    'link_with_price' : 1,
    'vf_product_lvl' : 1,
    'vf_location_lvl' : 1,
    'vf_customer_lvl' : 1,
    'vf_distr_channel_lvl' : 1,
    'vf_time_lvl' : 'WEEK.2',
    'ml_product_lvl' : 7,
    'ml_location_lvl' : 5,
    'ml_customer_lvl' : 3,
    'ml_distr_channel_lvl' : 3,
    'ml_time_lvl' : 'WEEK.2',
    'out_product_lvl' : 7, # aggregation level 
    'out_location_lvl' : 5, # aggregation level
    'out_customer_lvl' : 5, # aggregation level
    'out_distr_channel_lvl' : 1, # aggregation level
    'out_time_lvl' : 'WEEK.2'
}

In [7]:
INITIAL_GLOBAL_FILE = {'IB_HIST_START_DT': datetime.datetime(2022, 6, 15), 
                      'IB_HIST_END_DT': datetime.datetime(2022, 7, 25),
                      'IB_FCST_HORIZON': datetime.timedelta(days = 30)
                      }

# Generate input data

In [8]:
def generate_ACC_AGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE):
    # Generates input data: forecast at 'out_..._lvl' aggregation level

    
    # set historical + forecast dates
    # for weekly frequency report start of the week (monday)
    freq = (CONFIG_FILE['out_time_lvl'][0] + '-MON') if (CONFIG_FILE['out_time_lvl'] == 'WEEK.2') else CONFIG_FILE['out_time_lvl'][0]
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'] - pd.Timedelta(days=INITIAL_GLOBAL_FILE['IB_HIST_START_DT'].weekday()), INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'], freq = freq)

    # create df from IN_DATA values, disaggregated at out_..._lvl
    ACC_AGG_HYBRID_FORECAST_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_lvl_id{CONFIG_FILE[f'out_{key}_lvl']}"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        ACC_AGG_HYBRID_FORECAST_ = pd.merge(ACC_AGG_HYBRID_FORECAST_, keys_df, 'cross')

    # fill unknown data with random numbers
    ACC_AGG_HYBRID_FORECAST_['segment_name'] = np.random.choice([1, 2, 3, 4], ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['vf_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['demand_type'] = np.random.choice(['promo', 'regular'], ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['ml_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['hybrid_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['ensemble_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['forecast_source'] = np.random.choice(['vf', 'ml'], ACC_AGG_HYBRID_FORECAST_.shape[0])

    return ACC_AGG_HYBRID_FORECAST_

In [9]:
ACC_AGG_HYBRID_FORECAST_ = generate_ACC_AGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE)

In [10]:
ACC_AGG_HYBRID_FORECAST_ 

Unnamed: 0,period_dt,product_lvl_id7,location_lvl_id5,customer_lvl_id5,distr_channel_lvl_id1,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
0,2022-06-13,70001,500015,5000001,2,3,170.240357,regular,140.009609,100.615949,140.914329,vf
1,2022-06-13,70001,500015,5000002,2,1,115.326990,regular,131.421797,192.579379,141.353935,vf
2,2022-06-13,70001,500014,5000001,2,2,171.310643,promo,127.919069,173.169824,189.827424,vf
3,2022-06-13,70001,500014,5000002,2,3,123.879588,regular,144.777318,179.639310,133.806674,vf
4,2022-06-13,70002,500015,5000001,2,3,129.096090,regular,195.753672,124.843907,195.327195,ml
...,...,...,...,...,...,...,...,...,...,...,...,...
655,2022-08-22,70014,500014,5000002,2,4,165.138810,regular,187.799150,178.714782,170.220694,ml
656,2022-08-22,70015,500015,5000001,2,4,196.345499,promo,198.668918,116.197390,112.154234,ml
657,2022-08-22,70015,500015,5000002,2,3,161.365540,promo,198.212612,116.170615,106.941207,ml
658,2022-08-22,70015,500014,5000001,2,2,189.863608,promo,148.814589,136.408234,123.836471,vf


In [11]:
def generate_DISACC_DISAGG_HYBRID_FORECAST_(IN_DATA, 
                                            CONFIG_FILE,
                                            INITIAL_GLOBAL_FILE):

    # Generates input data: forecasts at lowest hierarchy level
    
    # set historical + forecast dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    DISACC_DISAGG_HYBRID_FORECAST_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        DISACC_DISAGG_HYBRID_FORECAST_ = pd.merge(DISACC_DISAGG_HYBRID_FORECAST_, keys_df, 'cross')

    # fill unknown data with random numbers
    DISACC_DISAGG_HYBRID_FORECAST_['segment_name'] = np.random.choice([1, 2, 3, 4], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['vf_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['demand_type'] = np.random.choice(['promo', 'regular'], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['ml_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['hybrid_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['ensemble_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['forecast_source'] = np.random.choice(['vf', 'ml'], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])

    return DISACC_DISAGG_HYBRID_FORECAST_

In [12]:
DISACC_DISAGG_HYBRID_FORECAST_ = generate_DISACC_DISAGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE)

In [13]:
DISACC_DISAGG_HYBRID_FORECAST_ 

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
0,2022-06-15,80001,600002,6000002,1,1,107.028371,regular,115.855104,114.261617,157.213422,ml
1,2022-06-15,80001,600002,6000003,1,1,142.789347,regular,108.938160,183.415684,144.980236,ml
2,2022-06-15,80001,600002,6000004,1,2,198.256219,regular,122.816058,176.290248,198.991702,vf
3,2022-06-15,80001,600002,6000005,1,4,176.296950,promo,186.315259,183.587174,144.205040,vf
4,2022-06-15,80001,600002,6000006,1,4,136.648824,regular,120.316437,172.644250,111.683481,vf
...,...,...,...,...,...,...,...,...,...,...,...,...
26620,2022-08-24,80015,600006,6000002,1,1,137.446809,regular,112.966504,130.189957,167.336556,vf
26621,2022-08-24,80015,600006,6000003,1,3,111.396828,promo,193.549945,130.273614,114.609558,ml
26622,2022-08-24,80015,600006,6000004,1,3,113.540506,regular,161.780300,115.607994,123.605435,vf
26623,2022-08-24,80015,600006,6000005,1,1,108.394220,regular,107.016431,143.597497,145.040117,vf


In [14]:
def generate_DEMAND_RESTORED_(IN_DATA, 
                              CONFIG_FILE,
                              INITIAL_GLOBAL_FILE):
    
    # Generates input data: demand info
    
    # set historical dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'] , INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    DEMAND_RESTORED_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel', 'promo']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        DEMAND_RESTORED_ = pd.merge(DEMAND_RESTORED_, keys_df, 'cross')

    # fill unknown data with random numbers
    DEMAND_RESTORED_['tgt_qty_r'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['promo_flg'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['tgt_qty'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['stock_qty'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['deficit_flg1'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['deficit_flg2'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    
    return DEMAND_RESTORED_

In [15]:
DEMAND_RESTORED_ = generate_DEMAND_RESTORED_(IN_DATA, 
                              CONFIG_FILE,
                              INITIAL_GLOBAL_FILE)

In [16]:
DEMAND_RESTORED_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,promo_id,tgt_qty_r,promo_flg,tgt_qty,stock_qty,deficit_flg1,deficit_flg2
0,2022-06-15,80001,600002,6000002,1,1,141.521769,1,127.308112,133.760130,1,1
1,2022-06-15,80001,600002,6000003,1,1,191.223245,0,134.879453,178.710192,0,0
2,2022-06-15,80001,600002,6000004,1,1,166.317866,0,135.449944,113.563299,1,1
3,2022-06-15,80001,600002,6000005,1,1,100.588016,0,149.389634,118.303733,1,1
4,2022-06-15,80001,600002,6000006,1,1,129.237586,1,169.126406,139.552503,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15370,2022-07-25,80015,600006,6000002,1,1,120.792428,0,192.646577,104.433597,1,0
15371,2022-07-25,80015,600006,6000003,1,1,124.348328,0,157.634691,166.945781,1,1
15372,2022-07-25,80015,600006,6000004,1,1,198.106172,1,104.297640,199.184297,0,0
15373,2022-07-25,80015,600006,6000005,1,1,129.694162,0,100.592280,173.348315,1,0


In [17]:
def generate_FORECAST_FLAG_(IN_DATA, 
                   CONFIG_FILE,
                   INITIAL_GLOBAL_FILE):

    # Generates input data: flags whether given unit is active and should be used in forecasting or not
    
    # set historical and forecast dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    FORECAST_FLAG_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        FORECAST_FLAG_ = pd.merge(FORECAST_FLAG_, keys_df, 'cross')

    FORECAST_FLAG_['period_start_dt'] = INITIAL_GLOBAL_FILE['IB_HIST_START_DT']
    FORECAST_FLAG_['period_end_dt'] = INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON']

    # set status ['active', 'blocked', 'out-of-sale'] for each data unit. 
    # Assume that for the forecasting period last historical status is preserved

    timerange_hist = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])
    FORECAST_FLAG_HIST = pd.DataFrame(timerange_hist, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        FORECAST_FLAG_HIST = pd.merge(FORECAST_FLAG_HIST, keys_df, 'cross')

    FORECAST_FLAG_HIST['status'] = np.random.choice(['active', 'blocked', 'out-of-sale'], p = [0.5, 0.25, 0.25], size = FORECAST_FLAG_HIST.shape[0])
    FORECAST_FLAG_ =  pd.merge(FORECAST_FLAG_, FORECAST_FLAG_HIST, how = 'outer')

    last_vals = FORECAST_FLAG_.groupby(by = ['product_id', 'location_id', 'customer_id', 'distr_channel_id']).last()['status'].reset_index()

    missing = FORECAST_FLAG_[FORECAST_FLAG_['status'].isna()].drop(columns = 'status')
    missing = pd.merge(missing, last_vals, how = 'outer')
    
    FORECAST_FLAG_ = pd.concat([FORECAST_FLAG_[~FORECAST_FLAG_['status'].isna()], missing], ignore_index = True)
    FORECAST_FLAG_.sort_values(['period_dt', 'product_id', 'location_id', 'customer_id', 'distr_channel_id'])
    
    return FORECAST_FLAG_

In [18]:
FORECAST_FLAG_ = generate_FORECAST_FLAG_(IN_DATA, CONFIG_FILE, INITIAL_GLOBAL_FILE)

In [19]:
FORECAST_FLAG_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,period_start_dt,period_end_dt,status
0,2022-06-15,80001,600002,6000002,1,2022-06-15,2022-08-24,active
1,2022-06-15,80001,600002,6000003,1,2022-06-15,2022-08-24,active
2,2022-06-15,80001,600002,6000004,1,2022-06-15,2022-08-24,active
3,2022-06-15,80001,600002,6000005,1,2022-06-15,2022-08-24,active
4,2022-06-15,80001,600002,6000006,1,2022-06-15,2022-08-24,blocked
...,...,...,...,...,...,...,...,...
26620,2022-08-20,80015,600006,6000006,1,2022-06-15,2022-08-24,active
26621,2022-08-21,80015,600006,6000006,1,2022-06-15,2022-08-24,active
26622,2022-08-22,80015,600006,6000006,1,2022-06-15,2022-08-24,active
26623,2022-08-23,80015,600006,6000006,1,2022-06-15,2022-08-24,active


In [20]:
def generate_PRE_ABT_(IN_DATA, 
                      CONFIG_FILE,
                      INITIAL_GLOBAL_FILE):

    # Generate input data: data used in forecasting
    
    # set historical dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    PRE_ABT_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel', 'promo']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        PRE_ABT_ = pd.merge(PRE_ABT_, keys_df, 'cross')

    # fill unknown data with random numbers
    PRE_ABT_['tgt_qty_r'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['tgt_qty'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['stock_qty'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['deficit_flg1'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['deficit_flg2'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['promo_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['num_autorization'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['price_promo'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['promo_type1_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['promo_type2_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['price_act'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['price_reg'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])    

    return PRE_ABT_

In [21]:
PRE_ABT_ = generate_PRE_ABT_(IN_DATA, CONFIG_FILE, INITIAL_GLOBAL_FILE)

In [22]:
PRE_ABT_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,promo_id,tgt_qty_r,tgt_qty,stock_qty,deficit_flg1,deficit_flg2,promo_flg,num_autorization,price_promo,promo_type1_flg,promo_type2_flg,price_act,price_reg
0,2022-06-15,80001,600002,6000002,1,1,114.991352,116.131187,138.456738,1,0,0,0,199.190978,1,1,152.602403,145.165022
1,2022-06-15,80001,600002,6000003,1,1,198.550884,188.162176,171.250225,1,1,1,1,125.893670,0,1,178.102574,111.323175
2,2022-06-15,80001,600002,6000004,1,1,112.646414,103.946537,175.874261,1,1,1,1,166.144588,1,1,156.240319,111.449889
3,2022-06-15,80001,600002,6000005,1,1,140.277804,150.083741,141.515477,0,1,0,0,120.314307,0,0,145.076284,136.279970
4,2022-06-15,80001,600002,6000006,1,1,136.158326,159.396182,143.005519,0,1,1,1,181.990724,0,1,132.163843,121.961294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15370,2022-07-25,80015,600006,6000002,1,1,182.134761,129.830240,133.085220,1,1,0,0,174.239415,1,0,124.436728,158.698466
15371,2022-07-25,80015,600006,6000003,1,1,179.556882,107.610976,126.356345,0,0,1,0,108.749769,0,1,110.018879,119.117903
15372,2022-07-25,80015,600006,6000004,1,1,188.982721,156.781311,191.504870,1,0,1,1,197.915397,0,0,144.667641,157.444265
15373,2022-07-25,80015,600006,6000005,1,1,137.389436,155.900817,189.215551,1,1,0,0,130.521926,0,1,119.709115,160.514854


In [23]:
ARLEY_CRIT = pd.DataFrame({
    'cnt_observations_lbound': [0, 4, 6, 8, 10, 12, 14, 16],
    'cnt_observations_ubound': [3, 5, 7, 9, 11, 13, 15, 999999],
    'k_arley_001': [1.60, 2.15, 2.30, 2.40, 2.44, 2.46, 2.46, 2.46],
    'k_arley_005': [1.50, 1.85, 1.90, 1.92, 1.92, 1.92, 1.92, 1.92]})

In [24]:
ARLEY_CRIT

Unnamed: 0,cnt_observations_lbound,cnt_observations_ubound,k_arley_001,k_arley_005
0,0,3,1.6,1.5
1,4,5,2.15,1.85
2,6,7,2.3,1.9
3,8,9,2.4,1.92
4,10,11,2.44,1.92
5,12,13,2.46,1.92
6,14,15,2.46,1.92
7,16,999999,2.46,1.92


# Type 1
## delete forecast for inactive period and mark forecast with missing valuesfor in-active period of assortment lifecycle 

Algorithm description:\
Input: DISACC_DISAGG_HYBRID_FORECAST_, FORECAST_FLAG_\
Algorithm description: 
- corr1: flag forecasts that should not be forecasted (not active based on FORECAST_FLAG_) (app)
- corr2: flag forecasts with missing values
Output: DISACC_DISAGG_HYBRID_FORECAST_ with flags for corr1, corr2

In [25]:
# introduce noise into DISACC_DISAGG_HYBRID_FORECAST_
mask_1 = DISACC_DISAGG_HYBRID_FORECAST_['period_dt'] > INITIAL_GLOBAL_FILE['IB_HIST_END_DT']
mask_2 = np.random.choice([True, False], p = [0.01, 0.99], size = DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
mask = mask_1 & mask_2
DISACC_DISAGG_HYBRID_FORECAST_.loc[mask, 'hybrid_forecast_value'] = np.nan

In [60]:
EX_1_pre = DISACC_DISAGG_HYBRID_FORECAST_.iloc[np.where(mask == True)[0][7]-3 : np.where(mask == True)[0][7]+3, :]
EX_1_pre.loc[:, ['period_dt', 'product_id', 'location_id', 'customer_id', 'distr_channel_id', 'demand_type', 'hybrid_forecast_value']]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,demand_type,hybrid_forecast_value
16141,2022-07-28,80001,600005,6000003,1,promo,128.146789
16142,2022-07-28,80001,600005,6000004,1,regular,179.728399
16143,2022-07-28,80001,600005,6000005,1,promo,171.738626
16144,2022-07-28,80001,600005,6000006,1,promo,
16145,2022-07-28,80001,600006,6000002,1,promo,163.744231
16146,2022-07-28,80001,600006,6000003,1,promo,191.234615


In [29]:
T1 = autocorrections_type1(DISACC_DISAGG_HYBRID_FORECAST_, FORECAST_FLAG_)

In [66]:
# check flags
EX_1_post = T1.iloc[np.where(mask == True)[0][7]-3 : np.where(mask == True)[0][7]+3, :]
EX_1_post.loc[:, ['period_dt', 'product_id', 'location_id', 'customer_id', 'distr_channel_id', 'demand_type', 'hybrid_forecast_value', 'status', 'flg_apply_corr1', 'flg_apply_corr2']]

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,demand_type,hybrid_forecast_value,status,flg_apply_corr1,flg_apply_corr2
16141,2022-07-28,80001,600005,6000003,1,promo,128.146789,active,0,0
16142,2022-07-28,80001,600005,6000004,1,regular,179.728399,active,0,0
16143,2022-07-28,80001,600005,6000005,1,promo,171.738626,active,0,0
16144,2022-07-28,80001,600005,6000006,1,promo,,active,0,1
16145,2022-07-28,80001,600006,6000002,1,promo,163.744231,blocked,1,0
16146,2022-07-28,80001,600006,6000003,1,promo,191.234615,out-of-sale,1,0


# Type 2
## fill missing forecast values

Algorithm description:\
Input: output from autocorrections_1 (DISACC_DISAGG_HYBRID_FORECAST_ flagged on step 1)\
Algorithm description: 
- for active forecast instances
- differentiate over regular and promo
- for units with enough recent forecast values (based on CONFIG_PARAMETERS['ib_adj2_min_observ_num'])
- substitute missing forecasts with average forecast over last on CONFIG_PARAMETERS['ib_npf_max_hist_depth']
- if not enough recent forecast values, go up the hierarchy
- first collect recent forecasts on same unit but in all distribution channels
- if still not enough values ommit customer and then location
Output: DISACC_DISAGG_HYBRID_FORECAST_ with 2 flags for correction and missing forecasts replaced (if possible)

In [72]:
EX_2_pre = EX_1_post
EX_2_pre.loc[:, ['period_dt', 'product_id', 'location_id', 'customer_id', 'demand_type', 'hybrid_forecast_value', 'status', 'flg_apply_corr1', 'flg_apply_corr2']]

Unnamed: 0,period_dt,product_id,location_id,customer_id,demand_type,hybrid_forecast_value,status,flg_apply_corr1,flg_apply_corr2
16141,2022-07-28,80001,600005,6000003,promo,128.146789,active,0,0
16142,2022-07-28,80001,600005,6000004,regular,179.728399,active,0,0
16143,2022-07-28,80001,600005,6000005,promo,171.738626,active,0,0
16144,2022-07-28,80001,600005,6000006,promo,,active,0,1
16145,2022-07-28,80001,600006,6000002,promo,163.744231,blocked,1,0
16146,2022-07-28,80001,600006,6000003,promo,191.234615,out-of-sale,1,0


In [68]:
T2 = autocorrections_type2(T1, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE, 
          INITIAL_GLOBAL_FILE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if part_1.shape[0] > 0: part_1['hybrid_forecast_value_aft2'] = part_1.apply(own_average, axis=1,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if part_2.shape[0] > 0: part_2['hybrid_forecast_value_aft2'] = part_2.apply(group_average, axis=1,


In [73]:
EX_2_post = T2.iloc[np.where(mask == True)[0][7]-3 : np.where(mask == True)[0][7]+3, :]
EX_2_post.loc[:, ['period_dt', 'product_id', 'location_id', 'customer_id', 'demand_type', 'hybrid_forecast_value', 'status', 'flg_apply_corr1', 'flg_apply_corr2', 'hybrid_forecast_value_aft2']]

Unnamed: 0,period_dt,product_id,location_id,customer_id,demand_type,hybrid_forecast_value,status,flg_apply_corr1,flg_apply_corr2,hybrid_forecast_value_aft2
16141,2022-07-28,80001,600005,6000003,promo,128.146789,active,0,0,128.146789
16142,2022-07-28,80001,600005,6000004,regular,179.728399,active,0,0,179.728399
16143,2022-07-28,80001,600005,6000005,promo,171.738626,active,0,0,171.738626
16144,2022-07-28,80001,600005,6000006,promo,,active,0,1,161.742105
16145,2022-07-28,80001,600006,6000002,promo,163.744231,blocked,1,0,
16146,2022-07-28,80001,600006,6000003,promo,191.234615,out-of-sale,1,0,


# Type 3

# Treat outliers

Algorithm description:\
Input: output from autocorrections_2 (DISACC_DISAGG_HYBRID_FORECAST_ with missing forecasts filled on step_2\
Algorithm description: 
- for active forecast instances with regular demand and previously not corrected by step 2,
- for units with enough historical values (based on CONFIG_PARAMETERS['ib_adj3_min_observ_num'])
- check whether forecast is an outlier based on Atley criterion for the historical values for the unit
- if forecast is an outlier then replace it accoarding to CONFIG_PARAMETERS['ib_adj3_correction_method'] (mean or bound)
- if 'mean': replace with mean historical value
- if 'bound' replace with Arley criterion corresponding bound (upper / lower)
Output: DISACC_DISAGG_HYBRID_FORECAST_ with 3 flags for correction and final forecast value after all corrections

In [75]:
# introduce outliers
indexes = T2[(T2['period_dt'] > INITIAL_GLOBAL_FILE['IB_HIST_END_DT']) & (T2['demand_type'] == 'regular') & (T2['flg_apply_corr1'] == 0) & (T2['flg_apply_corr2'] == 0)].index
T2.loc[indexes[321], 'hybrid_forecast_value_aft2'] = 10000
T2.loc[indexes[456], 'hybrid_forecast_value_aft2'] = 10000
T2.loc[indexes[789], 'hybrid_forecast_value_aft2'] = 10

In [89]:
EX_3_pre = T2.loc[[indexes[321], indexes[456], indexes[789]], :]
EX_3_pre.loc[:, ['period_dt', 'product_id', 'customer_id', 'demand_type', 'status', 'flg_apply_corr1', 'flg_apply_corr2', 'hybrid_forecast_value_aft2']]

Unnamed: 0,period_dt,product_id,customer_id,demand_type,status,flg_apply_corr1,flg_apply_corr2,hybrid_forecast_value_aft2
16608,2022-07-29,80005,6000004,regular,active,0,0,10000.0
17096,2022-07-30,80009,6000002,regular,active,0,0,10000.0
18353,2022-08-02,80015,6000004,regular,active,0,0,10.0


In [79]:
# first try substituting outliers with mean
CONFIG_PARAMETERS['ib_adj3_correction_method'] = 'mean'

In [80]:
T3_1 = autocorrections_type3(T2, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE, 
          INITIAL_GLOBAL_FILE, ARLEY_CRIT)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3['hybrid_forecast_value_aft2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3.apply(own_average_hist, axis=1,


In [90]:
EX_3_post_1 = T3_1.loc[[indexes[321], indexes[456], indexes[789]], :]
EX_3_post_1.loc[:, ['period_dt', 'product_id', 'demand_type', 'status', 'flg_apply_corr1', 'flg_apply_corr2', 'flg_apply_corr3', 'hybrid_forecast_value_aft2', 'hybrid_forecast_value_aft3']]

Unnamed: 0,period_dt,product_id,demand_type,status,flg_apply_corr1,flg_apply_corr2,flg_apply_corr3,hybrid_forecast_value_aft2,hybrid_forecast_value_aft3
16608,2022-07-29,80005,regular,active,0,0,1,10000.0,145.463201
17096,2022-07-30,80009,regular,active,0,0,1,10000.0,160.175481
18353,2022-08-02,80015,regular,active,0,0,1,10.0,149.032762


In [83]:
# now try substituting outliers with bounds
CONFIG_PARAMETERS['ib_adj3_correction_method'] = 'bound'

In [84]:
T3_2 = autocorrections_type3(T2, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE, 
          INITIAL_GLOBAL_FILE, ARLEY_CRIT)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3['hybrid_forecast_value_aft2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  T3['hybrid_forecast_value_aft3'] = T3.apply(own_average_hist, axis=1,


In [91]:
EX_3_post_2 = T3_2.loc[[indexes[321], indexes[456], indexes[789]], :]
EX_3_post_2.loc[:, ['period_dt', 'product_id', 'demand_type', 'status', 'flg_apply_corr1', 'flg_apply_corr2', 'flg_apply_corr3', 'hybrid_forecast_value_aft2', 'hybrid_forecast_value_aft3']]

Unnamed: 0,period_dt,product_id,demand_type,status,flg_apply_corr1,flg_apply_corr2,flg_apply_corr3,hybrid_forecast_value_aft2,hybrid_forecast_value_aft3
16608,2022-07-29,80005,regular,active,0,0,1,10000.0,195.659049
17096,2022-07-30,80009,regular,active,0,0,1,10000.0,210.309759
18353,2022-08-02,80015,regular,active,0,0,1,10.0,103.785537
