In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
DPS_PRODUCT = pd.read_csv('data/DPS_PRODUCT.csv')
DPS_LOCATION = pd.read_csv('data/DPS_LOCATION.csv')
DPS_CUSTOMER = pd.read_csv('data/DPS_CUSTOMER.csv')
DPS_DISTR_CHANNEL = pd.read_csv('data/DPS_DISTR_CHANNEL.csv')
DPS_PROMO = pd.read_csv('data/DPS_PROMO.csv')

DPS_PRODUCT.columns = DPS_PRODUCT.columns.str.lower()
DPS_LOCATION.columns = DPS_LOCATION.columns.str.lower()
DPS_CUSTOMER.columns = DPS_CUSTOMER.columns.str.lower()
DPS_DISTR_CHANNEL.columns = DPS_DISTR_CHANNEL.columns.str.lower()
DPS_PROMO.columns  = DPS_PROMO.columns.str.lower()


In [4]:
IN_DATA = {
    'product' : DPS_PRODUCT,
    'location' : DPS_LOCATION,
    'customer' : DPS_CUSTOMER,
    'distr_channel' : DPS_DISTR_CHANNEL, 
    'promo': DPS_PROMO
}

In [5]:
for key in IN_DATA.keys():
   print(IN_DATA[key].columns)

Index(['product_lvl_id1', 'product_lvl_nm1', 'product_lvl_desc1',
       'product_lvl_id2', 'product_lvl_nm2', 'product_lvl_desc2',
       'product_lvl_id3', 'product_lvl_nm3', 'product_lvl_desc3',
       'product_lvl_id4', 'product_lvl_nm4', 'product_lvl_desc4',
       'product_lvl_id5', 'product_lvl_nm5', 'product_lvl_desc5',
       'product_lvl_id6', 'product_lvl_nm6', 'product_lvl_desc6',
       'product_lvl_id7', 'product_lvl_nm7', 'product_lvl_desc7',
       'parent_product_id', 'product_id', 'product_nm', 'product_desc',
       'modified_dttm', 'delete_flg'],
      dtype='object')
Index(['location_lvl_id1', 'location_lvl_nm1', 'location_lvl_desc1',
       'location_lvl_id2', 'location_lvl_nm2', 'location_lvl_desc2',
       'location_lvl_id3', 'location_lvl_nm3', 'location_lvl_desc3',
       'location_lvl_id4', 'location_lvl_nm4', 'location_lvl_desc4',
       'location_lvl_id5', 'location_lvl_nm5', 'location_lvl_desc5',
       'location_id', 'location_nm', 'location_desc', 'open_

In [3]:
# Set config parameters

In [6]:
CONFIG_PARAMETERS = {
    'ib_adj_forecast_list' : 'ACC_AGG_HYBRID_FORECAST_',
    'ib_npf_max_hist_depth' : 28,
    'ib_adj2_min_observ_num' : 7,
    'ib_adj2_base_past_period' : 56,
    'ib_adj3_base_past_period' : 56,
    'ib_adj3_seasonl_calc_lvl' : 7,  ####>????
    'ib_adj3_use_seas_coef_flg' : 1,
    'ib_adj3_min_observ_num' : 28,
    'ib_adj3_correction_method' : 'mean',
    'ibn_ff_active_status_list' : 'active'
}

In [7]:
CONFIG_FILE = {
    'tgt_type' : 'POS',
    'tgt_qty_table' : 'IN_SALES',
    'value_src' : 'SALES_QTY',
    'act_flag' : 1,
    'dr_scen' : 0, 
    'link_with_stock' : 1,
    'link_with_promo' : 1,
    'link_with_price' : 1,
    'vf_product_lvl' : 1,
    'vf_location_lvl' : 1,
    'vf_customer_lvl' : 1,
    'vf_distr_channel_lvl' : 1,
    'vf_time_lvl' : 'WEEK.2',
    'ml_product_lvl' : 7,
    'ml_location_lvl' : 5,
    'ml_customer_lvl' : 3,
    'ml_distr_channel_lvl' : 3,
    'ml_time_lvl' : 'WEEK.2',
    'out_product_lvl' : 7, # aggregation level 
    'out_location_lvl' : 5, # aggregation level
    'out_customer_lvl' : 5, # aggregation level
    'out_distr_channel_lvl' : 1, # aggregation level
    'out_time_lvl' : 'WEEK.2'
}

In [8]:
INITIAL_GLOBAL_FILE = {'IB_HIST_START_DT': datetime.datetime(2022, 6, 15), 
                      'IB_HIST_END_DT': datetime.datetime(2022, 7, 15),
                      'IB_FCST_HORIZON': datetime.timedelta(days = 30)
                      }

# Generate input data

In [284]:
def generate_ACC_AGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE):

    # set historical + forecast dates
    # for weekly frequency report start of the week (monday)
    freq = (CONFIG_FILE['out_time_lvl'][0] + '-MON') if (CONFIG_FILE['out_time_lvl'] == 'WEEK.2') else CONFIG_FILE['out_time_lvl'][0]
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'] - pd.Timedelta(days=INITIAL_GLOBAL_FILE['IB_HIST_START_DT'].weekday()), INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'], freq = freq)

    # create df from IN_DATA values, disaggregated at out_..._lvl
    ACC_AGG_HYBRID_FORECAST_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_lvl_id{CONFIG_FILE[f'out_{key}_lvl']}"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        ACC_AGG_HYBRID_FORECAST_ = pd.merge(ACC_AGG_HYBRID_FORECAST_, keys_df, 'cross')

    # fill unknown data with random numbers
    ACC_AGG_HYBRID_FORECAST_['segment_name'] = np.random.choice([1, 2, 3, 4], ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['vf_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['demand_type'] = np.random.choice(['promo', 'regular'], ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['ml_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['hybrid_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['ensemble_forecast_value'] = np.random.uniform(100, 200, ACC_AGG_HYBRID_FORECAST_.shape[0])
    ACC_AGG_HYBRID_FORECAST_['forecast_source'] = np.random.choice(['vf', 'ml'], ACC_AGG_HYBRID_FORECAST_.shape[0])

    return ACC_AGG_HYBRID_FORECAST_
    

In [103]:
ACC_AGG_HYBRID_FORECAST_ = generate_ACC_AGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE)

In [104]:
ACC_AGG_HYBRID_FORECAST_ 

Unnamed: 0,period_dt,product_lvl_id7,location_lvl_id5,customer_lvl_id5,distr_channel_lvl_id1,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
0,2022-06-13,70001,500015,5000001,2,4,139.497435,promo,156.418317,118.262932,177.421156,vf
1,2022-06-13,70001,500015,5000002,2,1,139.710838,promo,137.940992,132.118891,114.932849,ml
2,2022-06-13,70001,500015,5000003,2,2,196.770993,regular,152.337898,170.330462,105.296409,ml
3,2022-06-13,70001,500014,5000001,2,1,151.311731,promo,123.205420,180.039925,163.239418,vf
4,2022-06-13,70001,500014,5000002,2,2,116.785341,regular,194.183207,182.264350,147.425164,ml
...,...,...,...,...,...,...,...,...,...,...,...,...
11200,2022-08-08,70083,500013,5000002,2,1,145.772450,promo,124.132219,197.287308,191.346941,vf
11201,2022-08-08,70083,500013,5000003,2,1,121.063450,regular,141.881534,100.976083,103.808728,ml
11202,2022-08-08,70083,500017,5000001,2,1,140.010240,promo,152.266090,127.366019,115.374646,ml
11203,2022-08-08,70083,500017,5000002,2,4,152.271634,promo,109.007695,109.041280,129.372657,vf


In [285]:
def generate_DISACC_DISAGG_HYBRID_FORECAST_(IN_DATA, 
                                            CONFIG_FILE,
                                            INITIAL_GLOBAL_FILE):
    # set historical + forecast dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    DISACC_DISAGG_HYBRID_FORECAST_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        DISACC_DISAGG_HYBRID_FORECAST_ = pd.merge(DISACC_DISAGG_HYBRID_FORECAST_, keys_df, 'cross')

    # fill unknown data with random numbers
    DISACC_DISAGG_HYBRID_FORECAST_['segment_name'] = np.random.choice([1, 2, 3, 4], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['vf_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['demand_type'] = np.random.choice(['promo', 'regular'], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['ml_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['hybrid_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['ensemble_forecast_value'] = np.random.uniform(100, 200, DISACC_DISAGG_HYBRID_FORECAST_.shape[0])
    DISACC_DISAGG_HYBRID_FORECAST_['forecast_source'] = np.random.choice(['vf', 'ml'], DISACC_DISAGG_HYBRID_FORECAST_.shape[0])

    return DISACC_DISAGG_HYBRID_FORECAST_
    

In [238]:
DISACC_DISAGG_HYBRID_FORECAST_ = generate_DISACC_DISAGG_HYBRID_FORECAST_(IN_DATA, 
                                     CONFIG_FILE,
                                     INITIAL_GLOBAL_FILE)

In [164]:
DISACC_DISAGG_HYBRID_FORECAST_ 

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source
0,2022-06-15,80001,600002,6000002,1,4,118.185039,regular,157.780324,137.901200,161.827567,vf
1,2022-06-15,80001,600002,6000003,1,2,180.685342,regular,109.982777,179.115302,169.963072,vf
2,2022-06-15,80001,600002,6000004,1,2,180.192806,regular,130.798157,107.944546,137.659608,vf
3,2022-06-15,80001,600002,6000005,1,4,148.249495,promo,118.067031,182.743974,171.316264,ml
4,2022-06-15,80001,600002,6000006,1,3,161.287215,regular,103.087664,111.337223,165.280645,vf
...,...,...,...,...,...,...,...,...,...,...,...,...
1113855,2022-08-14,80083,600012,6000017,1,3,188.643435,promo,185.862591,173.250052,124.696086,ml
1113856,2022-08-14,80083,600012,6000018,1,1,151.560652,promo,127.373270,173.572107,152.603737,vf
1113857,2022-08-14,80083,600012,6000019,1,3,119.684192,regular,179.958605,137.909614,156.928648,ml
1113858,2022-08-14,80083,600012,6000020,1,2,168.113181,regular,103.008863,104.998049,101.857192,ml


In [113]:
def generate_DEMAND_RESTORED_(IN_DATA, 
                              CONFIG_FILE,
                              INITIAL_GLOBAL_FILE):
    # set historical dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'] , INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    DEMAND_RESTORED_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel', 'promo']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        DEMAND_RESTORED_ = pd.merge(DEMAND_RESTORED_, keys_df, 'cross')

    # fill unknown data with random numbers
    DEMAND_RESTORED_['tgt_qty_r'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['promo_flg'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['tgt_qty'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['stock_qty'] = np.random.uniform(100, 200, DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['deficit_flg1'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    DEMAND_RESTORED_['deficit_flg2'] = np.random.choice([0, 1], DEMAND_RESTORED_.shape[0])
    
    return DEMAND_RESTORED_

In [114]:
_DEMAND_RESTORED_ = generate_DEMAND_RESTORED_(IN_DATA, 
                              CONFIG_FILE,
                              INITIAL_GLOBAL_FILE)

In [115]:
_DEMAND_RESTORED_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,promo_id,tgt_qty_r,promo_flg,tgt_qty,stock_qty,deficit_flg1,deficit_flg2
0,2022-06-15,80001,600002,6000002,1,1,174.500847,0,198.745113,130.610359,1,0
1,2022-06-15,80001,600002,6000003,1,1,199.648842,1,149.884150,175.846111,1,0
2,2022-06-15,80001,600002,6000004,1,1,133.586939,0,175.465942,105.010653,1,0
3,2022-06-15,80001,600002,6000005,1,1,197.694760,1,160.798094,143.233911,1,0
4,2022-06-15,80001,600002,6000006,1,1,175.931492,0,127.236115,148.321483,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
566055,2022-07-15,80083,600012,6000017,1,1,140.483398,1,199.793022,120.473719,0,0
566056,2022-07-15,80083,600012,6000018,1,1,147.952109,0,129.013315,173.919296,0,0
566057,2022-07-15,80083,600012,6000019,1,1,135.436355,0,149.932848,162.667133,0,1
566058,2022-07-15,80083,600012,6000020,1,1,154.270507,1,128.686287,101.989865,0,0


In [110]:
def generate_FORECAST_FLAG_(IN_DATA, 
                   CONFIG_FILE,
                   INITIAL_GLOBAL_FILE):

    # set historical and forecast dates
    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    FORECAST_FLAG_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        FORECAST_FLAG_ = pd.merge(FORECAST_FLAG_, keys_df, 'cross')

    FORECAST_FLAG_['period_start_dt'] = INITIAL_GLOBAL_FILE['IB_HIST_START_DT']
    FORECAST_FLAG_['period_end_dt'] = INITIAL_GLOBAL_FILE['IB_HIST_END_DT'] + INITIAL_GLOBAL_FILE['IB_FCST_HORIZON']

    # set status ['active', 'blocked', 'out-of-sale'] for each data unit. 
    # Assume that for the forecasting period last historical status is preserved

    timerange_hist = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])
    FORECAST_FLAG_HIST = pd.DataFrame(timerange_hist, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        FORECAST_FLAG_HIST = pd.merge(FORECAST_FLAG_HIST, keys_df, 'cross')

    FORECAST_FLAG_HIST['status'] = np.random.choice(['active', 'blocked', 'out-of-sale'], FORECAST_FLAG_HIST.shape[0])
    FORECAST_FLAG_ =  pd.merge(FORECAST_FLAG_, FORECAST_FLAG_HIST, how = 'outer')

    last_vals = FORECAST_FLAG_.groupby(by = ['product_id', 'location_id', 'customer_id', 'distr_channel_id']).last()['status'].reset_index()

    missing = FORECAST_FLAG_[FORECAST_FLAG_['status'].isna()].drop(columns = 'status')
    missing = pd.merge(missing, last_vals, how = 'outer')
    
    FORECAST_FLAG_ = pd.concat([FORECAST_FLAG_[~FORECAST_FLAG_['status'].isna()], missing], ignore_index = True)
    FORECAST_FLAG_.sort_values(['period_dt', 'product_id', 'location_id', 'customer_id', 'distr_channel_id'])
    
    return FORECAST_FLAG_

In [111]:
FORECAST_FLAG_ = generate_FORECAST_FLAG_(IN_DATA, CONFIG_FILE, INITIAL_GLOBAL_FILE)

In [112]:
FORECAST_FLAG_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,period_start_dt,period_end_dt,status
0,2022-06-15,80001,600002,6000002,1,2022-06-15,2022-08-14,blocked
1,2022-06-15,80001,600002,6000003,1,2022-06-15,2022-08-14,out-of-sale
2,2022-06-15,80001,600002,6000004,1,2022-06-15,2022-08-14,active
3,2022-06-15,80001,600002,6000005,1,2022-06-15,2022-08-14,out-of-sale
4,2022-06-15,80001,600002,6000006,1,2022-06-15,2022-08-14,active
...,...,...,...,...,...,...,...,...
1113855,2022-08-10,80083,600012,6000021,1,2022-06-15,2022-08-14,active
1113856,2022-08-11,80083,600012,6000021,1,2022-06-15,2022-08-14,active
1113857,2022-08-12,80083,600012,6000021,1,2022-06-15,2022-08-14,active
1113858,2022-08-13,80083,600012,6000021,1,2022-06-15,2022-08-14,active


In [116]:
def generate_PRE_ABT_(IN_DATA, 
                      CONFIG_FILE,
                      INITIAL_GLOBAL_FILE):

    # set historical dates

    timerange = pd.date_range(INITIAL_GLOBAL_FILE['IB_HIST_START_DT'], INITIAL_GLOBAL_FILE['IB_HIST_END_DT'])

    # create df from IN_DATA values, disaggregated at out_..._lvl
    PRE_ABT_ = pd.DataFrame(timerange, columns=['period_dt'])
    for key in ['product', 'location', 'customer', 'distr_channel', 'promo']:
        column_name = f"{key}_id"
        keys_df = pd.DataFrame(IN_DATA[key][column_name]).drop_duplicates()
        PRE_ABT_ = pd.merge(PRE_ABT_, keys_df, 'cross')

    # fill unknown data with random numbers
    PRE_ABT_['tgt_qty_r'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['tgt_qty'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['stock_qty'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['deficit_flg1'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['deficit_flg2'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['promo_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['num_autorization'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['price_promo'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['promo_type1_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['promo_type2_flg'] = np.random.choice([0, 1], PRE_ABT_.shape[0])
    PRE_ABT_['price_act'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])
    PRE_ABT_['price_reg'] = np.random.uniform(100, 200, PRE_ABT_.shape[0])    

    return PRE_ABT_

In [118]:
PRE_ABT_ = generate_PRE_ABT_(IN_DATA, CONFIG_FILE, INITIAL_GLOBAL_FILE)

In [120]:
PRE_ABT_

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,promo_id,tgt_qty_r,tgt_qty,stock_qty,deficit_flg1,deficit_flg2,promo_flg,num_autorization,price_promo,promo_type1_flg,promo_type2_flg,price_act,price_reg
0,2022-06-15,80001,600002,6000002,1,1,186.821270,154.944976,142.571394,1,0,1,0,172.949918,1,0,138.546351,195.663646
1,2022-06-15,80001,600002,6000003,1,1,185.376504,126.084280,105.231834,0,1,0,0,163.427844,1,0,162.492281,186.936546
2,2022-06-15,80001,600002,6000004,1,1,196.055542,148.082464,153.653274,0,0,0,0,118.481569,0,1,185.461545,191.734127
3,2022-06-15,80001,600002,6000005,1,1,153.602219,148.072794,178.336317,1,0,1,0,154.536691,1,1,191.040069,105.340515
4,2022-06-15,80001,600002,6000006,1,1,159.637791,158.329330,152.335397,0,0,1,1,160.818720,1,1,110.839803,123.711533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566055,2022-07-15,80083,600012,6000017,1,1,108.163054,135.959388,178.986908,0,0,1,1,156.332579,0,1,116.670866,125.793307
566056,2022-07-15,80083,600012,6000018,1,1,164.633365,176.196393,128.895508,1,0,0,0,120.778653,1,0,151.678887,189.201882
566057,2022-07-15,80083,600012,6000019,1,1,138.825813,156.386839,189.168971,0,0,0,1,144.306688,1,1,169.296049,164.985796
566058,2022-07-15,80083,600012,6000020,1,1,103.233871,188.862278,127.630100,0,1,1,0,177.809595,0,0,136.743673,193.277598


# Type 1
## delete forecast for inactive period and mark forecast with missing valuesfor in-active period of assortment lifecycle 

In [239]:
# introduce noise into DISACC_DISAGG_HYBRID_FORECAST_

mask = np.random.choice([True, False], p = [0.99, 0.01], size = DISACC_DISAGG_HYBRID_FORECAST_.shape[0])

In [240]:
mask

array([ True,  True,  True, ...,  True,  True,  True], shape=(1113860,))

In [241]:
DISACC_DISAGG_HYBRID_FORECAST_.loc[~mask, 'hybrid_forecast_value'] = np.nan

In [242]:
def Type1(I1, # DISACC_DISAGG_HYBRID_FORECAST_
       I2 # FORECAST_FLAG_
      ):
    
    T1 = pd.merge(I1, I2, how = 'outer')

    # set 1 if 'status' is not 'active'
    T1['flg_apply_corr1'] = (T1['status'] != 'active').astype(int)

    # set 1 if 'hybrid_forecast_value' is not numeric
    T1['flg_apply_corr2'] = pd.to_numeric(T1['hybrid_forecast_value'], errors='coerce').isna().astype(int)

    return T1
    

In [243]:
T1 = Type1(DISACC_DISAGG_HYBRID_FORECAST_, FORECAST_FLAG_)

In [287]:
T1

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source,period_start_dt,period_end_dt,status,flg_apply_corr1,flg_apply_corr2
0,2022-06-15,80001,600002,6000002,1,1,153.434943,regular,199.907130,105.663901,193.453861,ml,2022-06-15,2022-08-14,blocked,1,0
1,2022-06-15,80001,600002,6000003,1,3,154.505227,promo,110.002313,184.457639,172.672837,ml,2022-06-15,2022-08-14,out-of-sale,1,0
2,2022-06-15,80001,600002,6000004,1,2,169.249076,promo,117.883307,102.998004,128.223102,ml,2022-06-15,2022-08-14,active,0,0
3,2022-06-15,80001,600002,6000005,1,1,132.766428,promo,193.936634,107.724877,101.591723,ml,2022-06-15,2022-08-14,out-of-sale,1,0
4,2022-06-15,80001,600002,6000006,1,4,116.401158,regular,177.903949,147.905193,101.382540,vf,2022-06-15,2022-08-14,active,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113855,2022-08-14,80083,600012,6000017,1,2,180.888302,regular,165.084098,153.964481,172.578834,ml,2022-06-15,2022-08-14,blocked,1,0
1113856,2022-08-14,80083,600012,6000018,1,3,121.999504,regular,114.783576,145.151767,115.640332,ml,2022-06-15,2022-08-14,blocked,1,0
1113857,2022-08-14,80083,600012,6000019,1,4,197.386517,promo,181.696225,122.948473,162.541969,ml,2022-06-15,2022-08-14,blocked,1,0
1113858,2022-08-14,80083,600012,6000020,1,4,124.570557,promo,163.441710,114.015217,196.252391,ml,2022-06-15,2022-08-14,active,0,0


# Type 2
## fill missing forecast values

In [282]:
def Type2(T1, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE):

    # when flg_apply_corr2 is encounntered, forecast value is replaced with either:
    # 1: average of past values for the unit (given above min number of values in min number of days)
    # 2: average of past values across similar units

    
    def group_average(unit, T1, CONFIG_PARAMETERS):
        close_dates = pd.date_range(unit['period_dt'] - pd.Timedelta(days = CONFIG_PARAMETERS['ib_npf_max_hist_depth'] - 1), unit['period_dt'])
        # fist omit 'distr_channel_id'
        similar = T1[(T1['product_id'] == unit['product_id']) & (T1['location_id'] == unit['location_id']) & (T1['customer_id'] == unit['customer_id']) & (T1['demand_type'] == unit['demand_type']) & (T1['period_dt'].isin(close_dates))].sort_values(by = 'period_dt', ascending = False)
        # next omit 'customer_id'
        similar = pd.concat([similar, 
                             T1[(T1['product_id'] == unit['product_id']) & (T1['location_id'] == unit['location_id']) & (T1['demand_type'] == unit['demand_type']) & (T1['period_dt'].isin(close_dates))].sort_values(by = 'period_dt', ascending = False)], ignore_index=True)
        # next omit 'location_id'
        similar = pd.concat([similar, 
                             T1[(T1['product_id'] == unit['product_id']) & (T1['demand_type'] == unit['demand_type']) & (T1['period_dt'].isin(close_dates))].sort_values(by = 'period_dt', ascending = False)], ignore_index=True)
        similar = similar.drop_duplicates()
        # choose 'ib_adj2_min_observ_num' (7) nearest dates
        similar = similar.dropna().head(CONFIG_PARAMETERS['ib_adj2_min_observ_num'])
        
        if similar.shape[0] == CONFIG_PARAMETERS['ib_adj2_min_observ_num']:
            return similar['hybrid_forecast_value'].mean()

    
    def own_average(unit, T1, CONFIG_PARAMETERS):
        same = T1[(T1['product_id'] == unit['product_id']) & (T1['location_id'] == unit['location_id']) & (T1['customer_id'] == unit['customer_id']) & (T1['distr_channel_id'] == unit['distr_channel_id'])  & (T1['demand_type'] == unit['demand_type']) & (T1['period_dt'] < unit['period_dt'])]
        # choose max close dates
        close_dates = pd.date_range(unit['period_dt'] - pd.Timedelta(days = CONFIG_PARAMETERS['ib_npf_max_hist_depth']), unit['period_dt'] - pd.Timedelta(days = 1))
        same = pd.merge(same, pd.DataFrame(close_dates, columns = ['period_dt']), how = 'right')
        
        # choose 'ib_adj2_min_observ_num' (7) nearest dates
        same = same.dropna().tail(CONFIG_PARAMETERS['ib_adj2_min_observ_num'])
        
        if same.shape[0] == CONFIG_PARAMETERS['ib_adj2_min_observ_num']:
            return same['hybrid_forecast_value'].mean()


    # first try own_average, then go group_average 
    part_0 = T1[(T1['flg_apply_corr2'] == 0) & ((T1['flg_apply_corr1'] == 0))]
    part_1 = T1[(T1['flg_apply_corr2'] == 1) & ((T1['flg_apply_corr1'] == 0))]
    part_1['hybrid_forecast_value'] = part_1.apply(own_average, axis = 1, args = (T1, CONFIG_PARAMETERS))
    
    part_2 = part_1[part_1['hybrid_forecast_value'].isna() == True]
    part_2['hybrid_forecast_value'] = part_2.apply(group_average, axis = 1, args = (T1, CONFIG_PARAMETERS))

    T2 = pd.concat([part_0, part_1], ignore_index = True)
    T2 = pd.concat([T2, part_1], ignore_index = True)
    T2 = T2.sort_values(by = ['period_dt', 'product_id', 'location_id', 'customer_id', 'distr_channel_id'])
    
    return T2

In [283]:
T2 = Type2(T1, 
          PRE_ABT_,
          DEMAND_RESTORED_,
          CONFIG_PARAMETERS,
          CONFIG_FILE)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part_1['hybrid_forecast_value'] = part_1.apply(own_average, axis = 1, args = (T1, CONFIG_PARAMETERS))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part_2['hybrid_forecast_value'] = part_2.apply(group_average, axis = 1, args = (T1, CONFIG_PARAMETERS))


In [286]:
T2

Unnamed: 0,period_dt,product_id,location_id,customer_id,distr_channel_id,segment_name,vf_forecast_value,demand_type,ml_forecast_value,hybrid_forecast_value,ensemble_forecast_value,forecast_source,period_start_dt,period_end_dt,status,flg_apply_corr1,flg_apply_corr2
0,2022-06-15,80001,600002,6000004,1,2,169.249076,promo,117.883307,102.998004,128.223102,ml,2022-06-15,2022-08-14,active,0,0
1,2022-06-15,80001,600002,6000006,1,4,116.401158,regular,177.903949,147.905193,101.382540,vf,2022-06-15,2022-08-14,active,0,0
2,2022-06-15,80001,600002,6000012,1,2,175.722710,promo,119.176683,171.271081,118.990485,vf,2022-06-15,2022-08-14,active,0,0
3,2022-06-15,80001,600003,6000003,1,1,173.957941,regular,125.287714,141.715229,142.355915,vf,2022-06-15,2022-08-14,active,0,0
4,2022-06-15,80001,600003,6000006,1,3,171.962435,regular,182.119305,128.629655,174.989857,ml,2022-06-15,2022-08-14,active,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368738,2022-08-14,80083,600012,6000007,1,4,181.447697,promo,174.675253,164.647522,188.965798,vf,2022-06-15,2022-08-14,active,0,0
368739,2022-08-14,80083,600012,6000008,1,1,106.462566,regular,129.402718,154.219951,162.722838,ml,2022-06-15,2022-08-14,active,0,0
368740,2022-08-14,80083,600012,6000013,1,3,188.795930,regular,143.550833,125.812592,169.330137,ml,2022-06-15,2022-08-14,active,0,0
368741,2022-08-14,80083,600012,6000020,1,4,124.570557,promo,163.441710,114.015217,196.252391,ml,2022-06-15,2022-08-14,active,0,0
